tgt/tgt-git-update

diff --git a/doc/targets.conf.example b/doc/targets.conf.example
index 46be8fe..ac8cf69 100644
--- a/doc/targets.conf.example
+++ b/doc/targets.conf.example
@@ -81,6 +81,7 @@ default-driver iscsi
 # Note that some parameters (write-cache, scsi_sn) were specified "globally".
 # "Global" parameters will be applied to all LUNs; they can be overwritten
 # "locally", per LUN.
+# If lun is not specified, it will be allocated automatically (first available).

 <target iqn.2008-09.com.example:server.target5>

@@ -88,26 +89,26 @@ default-driver iscsi
 	vendor_id VENDOR1
 	removable 1
 	device-type cd
-	# lun 1		# Not yet supported
+	lun 1
     </direct-store>

     <direct-store /dev/sda>
 	vendor_id VENDOR2
-	# lun 2		# Not yet supported
+	lun 2
     </direct-store>

     <backing-store /dev/sdb1>
 	vendor_id back1
 	scsi_sn SERIAL
 	write-cache on
-	# lun 3		# Not yet supported
+	# lun 3		# lun is commented out - will be allocated automatically
     </backing-store>

     <backing-store /dev/sdd1>
 	vendor_id back2
 	#mode_page 8:0:18:0x10:0:0xff....
 	#mode_page 8:0:18:0x10:0:0xff....
-	# lun 4		# Not yet supported
+	lun 15
     </backing-store>

     # Some more parameters which can be specified locally or globally:
@@ -123,11 +124,12 @@ default-driver iscsi
     #mode_page 8:0:18:0x10:0:0xff....
     #mode_page 8:0:18:0x10:0:0xff....
     #device-type
+    #allow-in-use	# if specified globally, can't be overwritten locally

     write-cache off
     scsi_sn multipath-10

-    # Parameters below are global. They can't be configured per LUN.
+    # Parameters below are only global. They can't be configured per LUN.
     # Only allow connections from 192.168.100.1 and 192.168.200.5
     initiator-address 192.168.100.1
     initiator-address 192.168.200.5
@@ -142,10 +144,47 @@ default-driver iscsi
 </target>


+# The device gets lun 1 by default; here it is explicitly set to lun 10
+<target iqn.2008-09.com.example:server.target6>
+    backing-store /dev/LVM/somedevice
+    lun 10
+</target>
+
+
+# Devices which are in use (by the system: mounted, used for swap, part of a
+# RAID array; or by userspace: dd, tgtd serving another target, etc.) can't be
+# used unless you pass the --force flag or add the 'allow-in-use yes' option.
+<target iqn.2008-09.com.example:server.target7>
+    backing-store /dev/LVM/somedevice
+    allow-in-use yes
+</target>
+
+<target iqn.2008-09.com.example:server.target8>
+    <backing-store /dev/LVM/somedevice>
+        scsi_sn serial1
+    </backing-store>
+
+    <backing-store /dev/LVM/somedevice2>
+        scsi_sn serial2
+    </backing-store>
+
+    allow-in-use yes
+</target>
+
+
+
+

 # Not supported configurations, and therefore, commented out:

-#<target iqn.2008-09.com.example:server.target6>
+#<target iqn.2008-09.com.example:server.target9>
+#    backing-store /dev/LVM/somedevice1
+#    backing-store /dev/LVM/somedevice2
+#    lun 10
+#    lun 11
+#</target>
+
+#<target iqn.2008-09.com.example:server.target10>
 #    <direct-store /dev/sdd>
 #        vendor_id VENDOR1
 #    </direct-store>
@@ -155,7 +194,7 @@ default-driver iscsi

 # This one will break the parser:

-#<target iqn.2008-09.com.example:server.target7>
+#<target iqn.2008-09.com.example:server.target11>
 #    <direct-store /dev/sdd>
 #        vendor_id VENDOR1
 #    </direct-store>
diff --git a/scripts/tgt-admin b/scripts/tgt-admin
index e4be373..c352952 100755
--- a/scripts/tgt-admin
+++ b/scripts/tgt-admin
@@ -125,7 +125,7 @@ sub process_targets {
 sub parse_configs {
 	# Parse the config
 	if ($alternate_conf ne 0) {
-		# Check if alternative configuration file exist
+		# Check if alternative configuration file exists
 		if (-e "$alternate_conf") {
 			execute("# Using $alternate_conf as configuration file\n");
 			%conf = ParseConfig(-ConfigFile => "$alternate_conf", -UseApacheInclude => 1, -IncludeGlob => 1,);
@@ -211,6 +211,7 @@ sub add_targets {
 					# and other parameters which can be specified globally
 					my %target_options;
 					my $target_options_ref;
+					my $data_key;
 					foreach my $k3 (sort keys %{$conf{$k}{$k2}}) {
 						$lun = 1;
 						$option = $k3;
@@ -218,6 +219,7 @@ sub add_targets {
 						check_value($value);
 						$target_options{$option} = $value;
 						$target_options_ref = \%target_options;
+						$data_key = make_key($target_options_ref, "lun", "allow-in-use");
 					}

 					if (not defined $target_options{"driver"}) {
@@ -230,7 +232,7 @@ sub add_targets {
 						$option = $k3;
 						$value = $conf{$k}{$k2}{$k3};
 						check_value($value);
-						process_options($target_options_ref);
+						process_options($target_options_ref,$data_key);
 						# If there was no option called "initiator-address", it means
 						# we want to allow ALL initiators for this target
 						if ($option eq "initiator-address") {
@@ -258,6 +260,27 @@ sub add_targets {
 	}
 }

+# Pre-parse the config and get some values we need
+sub make_key {
+	# Returns a hash ref: {device}{option} = value, for every requested
+	# option name, taken from each backing-store/direct-store section.
+	# A value is undef when the section does not set that option.
+	my ($target_options_ref, @actions) = @_;
+	my %data_key;
+
+	foreach my $store_type ('backing-store', 'direct-store') {
+		my $stores = $$target_options_ref{$store_type};
+		# Skip store types that are absent or not given as a hash
+		next if (ref $stores ne "HASH");
+		foreach my $device (keys %$stores) {
+			foreach my $action (@actions) {
+				$data_key{$device}{$action} = $$stores{$device}{$action};
+			}
+		}
+	}
+	return \%data_key;
+}
+
 # Some options can be specified only once
 sub check_if_hash_array {
 	my $check = $_[0];
@@ -285,9 +308,15 @@ sub check_exe {
 	foreach my $path (@path) {
 		if ( -x "$path/$command" && -f "$path/$command" ) { $exists = 1 }
 	}
-	if ( $exists == 0 ) {
-		print "Command $command (needed by $option option in your config file) is not in your path - can't continue!\n";
-		exit 1;
+	if ($exists == 0) {
+		if ($command eq "sg_inq") {
+			print "Command '$command' (needed by '$option') is not in your path - can't continue!\n";
+			exit 1;
+		} elsif ($command eq "lsof") {
+			execute("# Command '$command' is not in your path.");
+			execute("# Can't reliably check if device is not in use.");
+			return 1;
+		}
 	}
 }

@@ -315,27 +344,61 @@ sub add_params {
 	}
 }

+# Find next available LUN
+sub find_next_lun {
+	# Pick the lowest LUN (starting at 1) that no device has claimed yet.
+	#   $_[0] - path of the backing/direct store that needs a LUN
+	#   $_[1] - ref to the hash built by make_key(): {device}{option}
+	# The chosen LUN is recorded under {device}{'lun'} and returned, so
+	# that later calls do not hand out the same number again.
+	my $backing_store = $_[0];
+	my $data_key_ref = $_[1];
+	my %lun_taken;
+	foreach my $device (keys %$data_key_ref) {
+		# Look at the 'lun' entry only: the same hash also holds other
+		# options (e.g. 'allow-in-use'), whose values are not LUNs and
+		# must not block a number from being allocated.
+		my $reserved = $$data_key_ref{$device}{'lun'};
+		if (defined $reserved && length $reserved) {
+			$lun_taken{$reserved} = 1;
+		}
+	}
+
+	my $found_lun = 1;
+	$found_lun += 1 while ($lun_taken{$found_lun});
+	$$data_key_ref{$backing_store}{'lun'} = $found_lun;
+	return $found_lun;
+}
+
 # Add backing or direct store
 sub add_backing_direct {
 	my $backing_store = $_[0];
 	my $target_options_ref = $_[1];
-	my $lun = $_[2];
+	my $lun;
+	my $data_key_ref = $_[2];
 	my $direct_store = $_[3];
 	my $driver = $$target_options_ref{"driver"};

 	# Is the device in use?
-	(my $can_alloc, my $dev) = check_device($backing_store);
+	my $can_alloc = 1;
+	if ($force != 1 && $$target_options_ref{'allow-in-use'} ne "yes") {
+		$can_alloc = check_device($backing_store,$data_key_ref);
+	}

-	# Needed if the config file has mixed definitions
-	if (ref($backing_store) eq "HASH") {
-		foreach my $backing_store (sort keys %$value) {
-			add_backing_direct($backing_store,$target_options_ref,$lun,$direct_store);
-			$lun += 1;
-		}
-		return $lun;
-	} elsif (-e $backing_store && $can_alloc == 1) {
+	if (-e $backing_store && ! -d $backing_store && $can_alloc == 1) {
 		my @exec_commands;
 		my $device_type;
+		my %luns;
+		my @added_luns;
+		# Find out LUNs which are "reserved" in the config file
+		if (ref $value eq "HASH") {
+			if (length $$data_key_ref{$backing_store}{'lun'}) {
+				$lun = $$data_key_ref{$backing_store}{'lun'};
+			} else {
+				# Find an available lun if it wasn't specified
+				$lun = find_next_lun($backing_store,$data_key_ref);
+			}
+		}
 		# Process parameters for each lun / backing store
 		if (ref $value eq "HASH") {
 			my %params_added;
@@ -447,6 +510,11 @@ sub add_backing_direct {
 				check_if_hash_array($$target_options_ref{"device-type"}, "device-type");
 				$device_type = $$target_options_ref{"device-type"};
 			}
+			# lun
+			if (length $$target_options_ref{"lun"}) {
+				check_if_hash_array($$target_options_ref{"lun"}, "lun");
+				$lun = $$target_options_ref{"lun"};
+			}
 		} else {
 			print "If you got here, this means your config file is not supported.\n";
 			print "Please report it to stgt mailing list and attach your config files.\n";
@@ -461,7 +529,9 @@ sub add_backing_direct {
 		$lun += 1;
 		return $lun;
 	} elsif ($can_alloc == 0) {
-		execute("# Skipping device $backing_store ($dev is mounted / in use)");
+		execute("# Skipping device $backing_store - it is in use.");
+		execute("# You can override it with --force or 'allow-in-use yes' config option.");
+		execute("# Note - do so only if you know what you're doing, you may damage your data.");
 	} else {
 		execute("# Skipping device: $backing_store");
 		execute("# $backing_store does not exist - please check the configuration file");
@@ -471,11 +541,12 @@ sub add_backing_direct {
 # Process options from the config file
 sub process_options {
 	my $target_options_ref = $_[0];
+	my $data_key_ref = $_[1];
 	my $driver = $$target_options_ref{"driver"};
 	if ($option eq "backing-store" || $option eq "direct-store") {
 		my $direct_store = 0;
 		if ($option eq "direct-store") {
-			check_exe("sg_inq", "direct-store");
+			check_exe("sg_inq", "option direct-store");
 			$direct_store = 1;
 		}

@@ -495,7 +566,13 @@ sub process_options {

 		if (ref($value) eq "HASH") {
 			foreach my $backing_store (sort keys %$value) {
-				$lun = add_backing_direct($backing_store,$target_options_ref,$lun,$direct_store);
+				if ($backing_store =~ m/HASH/) {
+					print "\nYour config file is not supported. See targets.conf.example for details.\n";
+					exit 1;
+				}
+			}
+			foreach my $backing_store (sort keys %$value) {
+				add_backing_direct($backing_store,$target_options_ref,$data_key_ref,$direct_store);
 			}
 		}
 	}
@@ -569,7 +646,7 @@ sub dump_config {

 	my @all_targets = keys %tgtadm_output_tid;

-	# If all targets use the same driver, us it only once in the config
+	# If all targets use the same driver, use it only once in the config
 	my $skip_driver = 0;
 	my @drivers_combined;
 	foreach my $current_target (@all_targets) {
@@ -976,63 +1053,43 @@ sub check_connected {
 }

 # Check if a device can be allocated
-my @rootfs_dev;
+# A device may be in use "by system" (mounted, swap, part of a RAID array
+# etc.) or "by user" (already exported by tgtd, or someone running e.g.
+# dd if=/dev/1st_device of=/dev/2nd_device).  Using a device more than once
+# can corrupt data, so it is refused unless the user really wants to.
+# Args: device path, make_key() hash ref.  Returns 1 if usable, 0 if in use.
 sub check_device {
-	my $tmp_dev = $_[0];
-
-	# Check if force flag is set
-	if ( $force == 0) {
-		# Check for rootfs devices
-		&find_rootfs_device();
-		$tmp_dev =~ s/\d//g;
-		# Check if device is on the same disk as rootfs
-		if (grep {$_ eq $tmp_dev} @rootfs_dev) {
-			return (0,$tmp_dev);
-		}
-	}
-	return 1;
-}
-
-# finds all the devices that rootfs is mounted on
-sub find_rootfs_device {
-	my @files=("/etc/mtab","/proc/mounts");
-	my @lines;
-	# read files
-	foreach my $file (@files){
-		if (open(FH,"$file")) {
-			@lines=(@lines,<FH>);
-			close (FH);
-		}
-	}
+	my $backing_store = $_[0];
+	my $data_key_ref = $_[1];

-	# parse files and finds all the device which mounted on /
-	foreach my $line (@lines){
-		chomp $line;
-		if (($line=~/^\/dev\//) && ($line=~/ \/ /)){
-			my @ln=split(' ',$line);
-			$ln[0]=~s/\d//g;
-			push(@rootfs_dev,$ln[0]);
-		}
+	# If allow-in-use is "yes", there is no need to do
+	# further tests
+	if ($$data_key_ref{$backing_store}{'allow-in-use'} eq "yes") {
+		return 1;
 	}

-	# read swap file
-	my $swap_file="/proc/swap";
-	if (open(FH,"$swap_file")) {
-		@lines=<FH>;
-		close (FH);
+	# Check if the system uses this device ($! is only valid on failure)
+	use Fcntl qw(O_RDONLY O_EXCL);
+	use Errno;
+	my $opened = sysopen(FH, $backing_store, O_RDONLY | O_EXCL);
+	if (!$opened && $!{EBUSY}) {
+		execute("# Device $backing_store is used by the system (mounted, used by swap?).");
+		return 0;
 	}
-	# parse swap file and finds all the swap devices
-	foreach my $line (@lines){
-		chomp $line;
-		if ($line=~/^\/dev\//) {
-			my @ln=split(' ',$line);
-			$ln[0]=~s/\d//g;
-			push(@rootfs_dev,$ln[0]);
+	close(FH) if ($opened);
+
+	# Check if userspace uses this device (plain sh redirect, no bashism)
+	my $lsof_check = check_exe("lsof");
+	if ($lsof_check ne 1) {
+		system("lsof $backing_store >/dev/null 2>&1");
+		my $exit_value  = $? >> 8;
+		if ($exit_value eq 0) {
+			execute("# Device $backing_store is used (already tgtd target?).");
+			execute("# Run 'lsof $backing_store' to see the details.");
+			return 0;
 		}
 	}
-	# remove duplicate entries from @rootfs_dev
-	my %seen = ();
-	@rootfs_dev = grep { ! $seen{ $_ }++ } @rootfs_dev;
+	return 1;
 }

 # Execute or just print (or both) everything we start or would start
diff --git a/usr/Makefile b/usr/Makefile
index 82ddf07..a59364b 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -58,7 +58,7 @@ PROGRAMS += tgtd tgtadm
 SCRIPTS += ../scripts/tgt-setup-lun ../scripts/tgt-admin
 TGTD_OBJS += tgtd.o mgmt.o target.o scsi.o log.o driver.o util.o work.o \
 		parser.o spc.o sbc.o mmc.o osd.o scc.o smc.o ssc.o bs_ssc.o \
-		bs.o
+		bs_null.o bs.o
 MANPAGES = ../doc/manpages/tgtadm.8 ../doc/manpages/tgt-admin.8 \
 		../doc/manpages/tgt-setup-lun.8

diff --git a/usr/be_byteshift.h b/usr/be_byteshift.h
index 5c6a619..82b7da6 100644
--- a/usr/be_byteshift.h
+++ b/usr/be_byteshift.h
@@ -40,6 +40,11 @@ static inline uint16_t get_unaligned_be16(const void *p)
 	return __get_unaligned_be16((const uint8_t *)p);
 }

+static inline uint32_t get_unaligned_be24(const uint8_t *p)
+{
+	return p[0] << 16 | p[1] << 8 | p[2];
+}
+
 static inline uint32_t get_unaligned_be32(const void *p)
 {
 	return __get_unaligned_be32((const uint8_t *)p);
@@ -55,6 +60,13 @@ static inline void put_unaligned_be16(uint16_t val, void *p)
 	__put_unaligned_be16(val, p);
 }

+static inline void put_unaligned_be24(uint32_t val, void *p)
+{
+	((uint8_t *)p)[0] = (val >> 16) & 0xff;
+	((uint8_t *)p)[1] = (val >> 8) & 0xff;
+	((uint8_t *)p)[2] = val & 0xff;
+}
+
 static inline void put_unaligned_be32(uint32_t val, void *p)
 {
 	__put_unaligned_be32(val, p);
diff --git a/usr/bs.c b/usr/bs.c
index cef7b19..542ef55 100644
--- a/usr/bs.c
+++ b/usr/bs.c
@@ -173,7 +173,8 @@ static void *bs_thread_worker_fn(void *arg)
 	return NULL;
 }

-int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn)
+int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn,
+		   int nr_threads)
 {
 	int i, ret;

@@ -205,12 +206,18 @@ int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn)
 	if (ret)
 		goto event_del;

-	for (i = 0; i < ARRAY_SIZE(info->worker_thread); i++) {
+	if (nr_threads > ARRAY_SIZE(info->worker_thread)) {
+		eprintf("too many threads %d\n", nr_threads);
+		nr_threads = ARRAY_SIZE(info->worker_thread);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
 		ret = pthread_create(&info->worker_thread[i], NULL,
 				     bs_thread_worker_fn, info);
 		if (ret)
 			goto destroy_threads;
 	}
+
 rewrite:
 	ret = write(info->command_fd[1], &ret, sizeof(ret));
 	if (ret < 0) {
@@ -261,7 +268,8 @@ void bs_thread_close(struct bs_thread_info *info)
 	info->stop = 1;
 	pthread_cond_broadcast(&info->pending_cond);

-	for (i = 0; i < ARRAY_SIZE(info->worker_thread); i++)
+	for (i = 0; i < ARRAY_SIZE(info->worker_thread) && /* bound first */
+		     info->worker_thread[i]; i++)
 		pthread_join(info->worker_thread[i], NULL);

 	pthread_cond_destroy(&info->finished_cond);
diff --git a/usr/bs_mmap.c b/usr/bs_mmap.c
index fff19d3..bb24f5e 100644
--- a/usr/bs_mmap.c
+++ b/usr/bs_mmap.c
@@ -96,7 +96,7 @@ static void bs_mmap_close(struct scsi_lu *lu)
 static int bs_mmap_init(struct scsi_lu *lu)
 {
 	struct bs_thread_info *info = BS_THREAD_I(lu);
-	return bs_thread_open(info, bs_mmap_request);
+	return bs_thread_open(info, bs_mmap_request, NR_WORKER_THREADS);
 }

 static void bs_mmap_exit(struct scsi_lu *lu)
diff --git a/usr/bs_null.c b/usr/bs_null.c
new file mode 100644
index 0000000..00137ff
--- /dev/null
+++ b/usr/bs_null.c
@@ -0,0 +1,68 @@
+/*
+ * NULL I/O backing store routine
+ *
+ * Copyright (C) 2008 Alexander Nezhinsky <nezhinsky@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "list.h"
+#include "tgtd.h"
+#include "scsi.h"
+
+#define NULL_BS_DEV_SIZE        (1ULL << 40)
+
+int bs_null_cmd_submit(struct scsi_cmd *cmd)
+{
+	scsi_set_result(cmd, SAM_STAT_GOOD);
+	return 0;
+}
+
+static int bs_null_open(struct scsi_lu *lu, char *path,
+			int *fd, uint64_t *size)
+{
+	*size = NULL_BS_DEV_SIZE;
+	dprintf("NULL backing store open, size: %" PRIu64 "\n", *size);
+	return 0;
+}
+
+static void bs_null_close(struct scsi_lu *lu)
+{
+}
+
+static int bs_null_cmd_done(struct scsi_cmd *cmd)
+{
+	return 0;
+}
+
+static struct backingstore_template null_bst = {
+	.bs_name		= "null",
+	.bs_datasize		= 0,
+	.bs_open		= bs_null_open,
+	.bs_close		= bs_null_close,
+	.bs_cmd_submit		= bs_null_cmd_submit,
+	.bs_cmd_done		= bs_null_cmd_done,
+};
+
+__attribute__((constructor)) static void bs_null_constructor(void)
+{
+	register_backingstore_template(&null_bst);
+}
diff --git a/usr/bs_rdwr.c b/usr/bs_rdwr.c
index e2ece4a..65a6136 100644
--- a/usr/bs_rdwr.c
+++ b/usr/bs_rdwr.c
@@ -147,7 +147,7 @@ static int bs_rdwr_init(struct scsi_lu *lu)
 {
 	struct bs_thread_info *info = BS_THREAD_I(lu);

-	return bs_thread_open(info, bs_rdwr_request);
+	return bs_thread_open(info, bs_rdwr_request, NR_WORKER_THREADS);
 }

 static void bs_rdwr_exit(struct scsi_lu *lu)
diff --git a/usr/bs_ssc.c b/usr/bs_ssc.c
index dcc3e30..b2e8818 100644
--- a/usr/bs_ssc.c
+++ b/usr/bs_ssc.c
@@ -208,7 +208,7 @@ static void bs_ssc_close(struct scsi_lu *lu)
 static int bs_ssc_init(struct scsi_lu *lu)
 {
 	struct bs_thread_info *info = BS_THREAD_I(lu);
-	return bs_thread_open(info, ssc_rdwr_request);
+	return bs_thread_open(info, ssc_rdwr_request, 1);
 }

 static void bs_ssc_exit(struct scsi_lu *lu)
diff --git a/usr/bs_thread.h b/usr/bs_thread.h
index b97861c..b2975a5 100644
--- a/usr/bs_thread.h
+++ b/usr/bs_thread.h
@@ -33,7 +33,8 @@ static inline struct bs_thread_info *BS_THREAD_I(struct scsi_lu *lu)
 	return (struct bs_thread_info *) ((char *)lu + sizeof(*lu));
 }

-extern int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn);
+extern int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn,
+			  int nr_threads);
 extern void bs_thread_close(struct bs_thread_info *info);
 extern int bs_thread_cmd_submit(struct scsi_cmd *cmd);

diff --git a/usr/iscsi/iscsi_rdma.c b/usr/iscsi/iscsi_rdma.c
index 46e6ea8..d3b5147 100644
--- a/usr/iscsi/iscsi_rdma.c
+++ b/usr/iscsi/iscsi_rdma.c
@@ -144,6 +144,8 @@ struct conn_info {
 	/* but count so we can drain CQ on close */
 	int recvl_posted;

+	struct tgt_event tx_sched;
+
 	/* login phase resources, freed at full-feature */
 	void *srbuf_login;
 	void *listbuf_login;
@@ -194,6 +196,8 @@ struct iser_device {
 	void *mempool_listbuf;
 	struct ibv_mr *mempool_mr;

+	struct tgt_event poll_sched;
+
 	/* free and allocated mempool entries */
 	struct list_head mempool_free, mempool_alloc;
 };
@@ -217,10 +221,6 @@ static struct list_head iser_conn_list;
 /* if any task needs an rdma read or write slot to proceed */
 static int waiting_rdma_slot;

-/* progress available, used with tgt_counter_event */
-static int num_tx_ready;
-static int num_rx_ready;
-
 #define uint64_from_ptr(p) (uint64_t)(uintptr_t)(p)
 #define ptr_from_int64(p) (void *)(unsigned long)(p)

@@ -251,6 +251,9 @@ static int num_rx_ready;
 #define RDMA_PER_CONN 20
 #define RDMA_TRANSFER_SIZE (512 * 1024)

+
+#define MAX_POLL_WC 8
+
 /*
  * Number of allocatable data buffers, each of this size.  Do at least 128
  * for linux iser.  The mempool size is rounded up at initialization time
@@ -270,13 +273,17 @@ static inline struct conn_info *RDMA_CONN(struct iscsi_connection *conn)
 	return container_of(conn, struct conn_info, iscsi_conn);
 }

-static void iser_cqe_handler(int fd, int events, void *data);
-static void iser_rx_progress(int *counter, void *data);
+static void iser_cqe_handler(int fd __attribute__((unused)),
+			     int events __attribute__((unused)),
+			     void *data);
 static void iser_rdma_read_completion(struct rdmalist *rdma);
 static void iscsi_rdma_release(struct iscsi_connection *conn);
 static int iscsi_rdma_show(struct iscsi_connection *conn, char *buf,
 			   int rest);
 static void iscsi_rdma_event_modify(struct iscsi_connection *conn, int events);
+static void iser_sched_poll_cq(struct tgt_event *tev);
+static void iser_sched_consume_cq(struct tgt_event *tev);
+static void iser_sched_tx(struct tgt_event *evt);

 /*
  * Called when ready for full feature, builds resources.
@@ -612,6 +619,8 @@ static int iser_device_init(struct iser_device *dev)
 		goto out;
 	}

+	tgt_init_sched_event(&dev->poll_sched, iser_sched_poll_cq, dev);
+
 	ret = ibv_req_notify_cq(dev->cq, 0);
 	if (ret) {
 		eprintf("ibv_req_notify failed: %s\n", strerror(ret));
@@ -691,6 +700,9 @@ static void iser_accept_connection(struct rdma_cm_event *event)
 	ci->login_phase = LOGIN_PHASE_START;
 	INIT_LIST_HEAD(&ci->conn_tx_ready);
 	list_add(&ci->iser_conn_list, &temp_conn);
+
+	tgt_init_sched_event(&ci->tx_sched, iser_sched_tx, ci);
+
 	/* initiator sits at dst, we are src */
 	memcpy(&ci->peer_addr, &event->id->route.addr.dst_addr,
 	       sizeof(ci->peer_addr));
@@ -940,7 +952,7 @@ static void handle_wc(struct ibv_wc *wc)
 		list_add(&rdmal->list, &ci->rdmal);
 		if (waiting_rdma_slot) {
 			waiting_rdma_slot = 0;
-			num_tx_ready = 1;
+			tgt_add_sched_event(&ci->tx_sched);
 		}
 		break;

@@ -957,7 +969,7 @@ static void handle_wc(struct ibv_wc *wc)
 		list_add(&rdmal->list, &ci->rdmal);
 		if (waiting_rdma_slot) {
 			waiting_rdma_slot = 0;
-			num_tx_ready = 1;
+			tgt_add_sched_event(&ci->tx_sched);
 		}
 		break;

@@ -974,85 +986,14 @@ close_err:
 }

 /*
- * Called directly from main event loop when a CQ notification is
- * available.
- */
-static void iser_cqe_handler(int fd __attribute__((unused)),
-			     int events __attribute__((unused)),
-			     void *data)
-{
-	int ret;
-	void *cq_context;
-	struct iser_device *dev = data;
-
-	ret = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context);
-	if (ret != 0) {
-		eprintf("notification, but no CQ event\n");
-		exit(1);
-	}
-
-	ibv_ack_cq_events(dev->cq, 1);
-
-	ret = ibv_req_notify_cq(dev->cq, 0);
-	if (ret) {
-		eprintf("ibv_req_notify_cq: %s\n", strerror(ret));
-		exit(1);
-	}
-
-	iser_rx_progress(NULL, dev);
-}
-
-/*
- * Called from tgtd when num_tx_ready (counter) non-zero.  Walks the
- * list of active connections and tries to push tx on each, until nothing
- * is ready anymore.  No progress limit here.
- */
-static void iser_tx_progress(int *counter __attribute__((unused)),
-			     void *data __attribute__((unused)))
-{
-	int reloop, ret;
-	struct conn_info *ci, *cin;
-	struct iscsi_connection *conn;
-
-	dprintf("entry\n");
-	num_tx_ready = 0;
-
-	do {
-		reloop = 0;
-		list_for_each_entry_safe(ci, cin, &conn_tx_ready,
-					 conn_tx_ready) {
-			conn = &ci->iscsi_conn;
-			if (conn->state == STATE_CLOSE) {
-				dprintf("ignoring tx for closed conn\n");
-			} else {
-				dprintf("trying tx\n");
-				ret = iscsi_tx_handler(conn);
-				if (conn->state == STATE_CLOSE) {
-					conn_close(conn);
-					dprintf("connection %p closed\n", ci);
-				} else {
-					if (ret == 0) {
-						reloop = 1;
-					} else {
-						/* but leave on tx ready list */
-						waiting_rdma_slot = 1;
-					}
-				}
-			}
-		}
-	} while (reloop);
-}
-
-/*
  * Could read as many entries as possible without blocking, but
  * that just fills up a list of tasks.  Instead pop out of here
  * so that tx progress, like issuing rdma reads and writes, can
  * happen periodically.
  */
-#define MAX_RX_PROGRESS 8
-static void iser_rx_progress_one(struct iser_device *dev)
+static int iser_poll_cq(struct iser_device *dev, int max_wc)
 {
-	int ret, numwc = 0;
+	int ret = 0, numwc = 0;
 	struct ibv_wc wc;
 	struct conn_info *ci;
 	struct recvlist *recvl;
@@ -1069,8 +1010,8 @@ static void iser_rx_progress_one(struct iser_device *dev)
 		VALGRIND_MAKE_MEM_DEFINED(&wc, sizeof(wc));
 		if (wc.status == IBV_WC_SUCCESS) {
 			handle_wc(&wc);
-			if (++numwc == MAX_RX_PROGRESS) {
-				num_rx_ready = 1;
+			if (++numwc == max_wc) {
+				ret = 1;
 				break;
 			}
 		} else if (wc.status == IBV_WC_WR_FLUSH_ERR) {
@@ -1089,23 +1030,114 @@ static void iser_rx_progress_one(struct iser_device *dev)
 				wc.status, (unsigned long long) wc.wr_id);
 		}
 	}
+	return ret;
+}
+
+static void iser_poll_cq_armable(struct iser_device *dev)
+{
+	int ret;
+
+	ret = iser_poll_cq(dev, MAX_POLL_WC);
+	if (ret < 0)
+		exit(1);
+
+	if (ret == 0) {
+		/* no more completions on cq, arm the completion interrupts */
+		ret = ibv_req_notify_cq(dev->cq, 0);
+		if (ret) {
+			eprintf("ibv_req_notify_cq: %s\n", strerror(ret));
+			exit(1);
+		}
+		dev->poll_sched.sched_handler = iser_sched_consume_cq;
+	} else
+		dev->poll_sched.sched_handler = iser_sched_poll_cq;
+
+	tgt_add_sched_event(&dev->poll_sched);
+}
+
+/* Scheduled to poll cq after a completion event has been
+   received and acknowledged; if no more completions are found,
+   the interrupts are re-armed */
+static void iser_sched_poll_cq(struct tgt_event *tev)
+{
+	struct iser_device *dev = tev->data;
+	iser_poll_cq_armable(dev);
+}
+
+/* Scheduled to consume completion events that could arrive
+   after the cq had been seen empty but just before
+   the notification interrupts were re-armed.
+   Intended to consume those remaining completions only,
+   this function does not re-arm interrupts. */
+static void iser_sched_consume_cq(struct tgt_event *tev)
+{
+	struct iser_device *dev = tev->data;
+	int ret;
+
+	ret = iser_poll_cq(dev, MAX_POLL_WC);
+	if (ret < 0)
+		exit(1);
+}
+
+/*
+ * Called directly from main event loop when a CQ notification is
+ * available.
+ */
+static void iser_cqe_handler(int fd __attribute__((unused)),
+			     int events __attribute__((unused)),
+			     void *data)
+{
+	struct iser_device *dev = data;
+	void *cq_context;
+	int ret;
+
+	ret = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context);
+	if (ret != 0) {
+		eprintf("notification, but no CQ event\n");
+		exit(1);
+	}
+
+	ibv_ack_cq_events(dev->cq, 1);
+
+	/* if a poll was previously scheduled, remove it,
+	   as it will be scheduled when necessary */
+	if (dev->poll_sched.scheduled)
+		tgt_remove_sched_event(&dev->poll_sched);
+
+	iser_poll_cq_armable(dev);
 }

 /*
- * Only one progress counter, must look across all devs.
+ * Called from tgtd as a scheduled event.  Tries to push tx on a
+ * connection until nothing is ready anymore.  No progress limit
+ * here.
  */
-static void iser_rx_progress(int *counter __attribute__((unused)), void *data)
+static void iser_sched_tx(struct tgt_event *evt)
 {
-	struct iser_device *dev;
+	struct conn_info *ci = evt->data;
+	struct iscsi_connection *conn = &ci->iscsi_conn;
+	int ret;

 	dprintf("entry\n");
-	num_rx_ready = 0;
-	if (data == NULL) {
-		list_for_each_entry(dev, &iser_dev_list, list)
-			iser_rx_progress_one(dev);
-	} else {
-		dev = data;
-		iser_rx_progress_one(dev);
+
+	if (conn->state == STATE_CLOSE) {
+		dprintf("ignoring tx for closed conn\n");
+		return;
+	}
+
+	for (;;) {
+		dprintf("trying tx\n");
+		ret = iscsi_tx_handler(conn);
+		if (conn->state == STATE_CLOSE) {
+			conn_close(conn);
+			dprintf("connection %p closed\n", ci);
+			break;
+		}
+		if (ret != 0) {
+			/* but leave on tx ready list */
+			waiting_rdma_slot = 1;
+			break;
+		}
 	}
 }

@@ -1165,10 +1197,7 @@ static int iscsi_rdma_init(void)
 	INIT_LIST_HEAD(&iser_dev_list);
 	INIT_LIST_HEAD(&iser_conn_list);
 	INIT_LIST_HEAD(&temp_conn);
-	num_tx_ready = 0;
-	num_rx_ready = 0;
-	ret = tgt_counter_event_add(&num_tx_ready, iser_tx_progress, NULL);
-	ret = tgt_counter_event_add(&num_rx_ready, iser_rx_progress, NULL);
+
 	return ret;
 }

@@ -1397,10 +1426,6 @@ static void iscsi_iser_write_end(struct iscsi_connection *conn)

 	ci->writeb = 0;  /* reset count */
 	ci->send_comm_event = NULL;
-
-	/* wake up the progress engine to do the done */
-	dprintf("inc progress to finish cmd\n");
-	num_tx_ready = 1;
 }

 /*
@@ -1505,7 +1530,7 @@ static int iscsi_rdma_rdma_write(struct iscsi_connection *conn)
 		iscsi_rdma_event_modify(conn, EPOLLIN);
 	} else {
 		/* poke ourselves to do the next rdma */
-		num_tx_ready = 1;
+		tgt_add_sched_event(&ci->tx_sched);
 	}

 	return ret;
@@ -1628,7 +1653,7 @@ static void iscsi_rdma_event_modify(struct iscsi_connection *conn, int events)
 			dprintf("tx ready adding %p\n", ci);
 			list_add(&ci->conn_tx_ready, &conn_tx_ready);
 		}
-		num_tx_ready = 1;
+		tgt_add_sched_event(&ci->tx_sched);
 	} else {
 		dprintf("tx ready removing %p\n", ci);
 		list_del_init(&ci->conn_tx_ready);
diff --git a/usr/log.c b/usr/log.c
index 076c770..056314a 100644
--- a/usr/log.c
+++ b/usr/log.c
@@ -24,6 +24,7 @@
 #include <unistd.h>
 #include <syslog.h>
 #include <signal.h>
+#include <errno.h>
 #include <sys/shm.h>
 #include <sys/ipc.h>
 #include <sys/types.h>
@@ -52,29 +53,39 @@ static int logarea_init (int size)
 	logdbg(stderr,"enter logarea_init\n");

 	if ((shmid = shmget(IPC_PRIVATE, sizeof(struct logarea),
-			    0644 | IPC_CREAT | IPC_EXCL)) == -1)
+			    0644 | IPC_CREAT | IPC_EXCL)) == -1) {
+		syslog(LOG_ERR, "shmget logarea failed %d", errno);
 		return 1;
+	}

 	la = shmat(shmid, NULL, 0);
-	if (!la)
+	if (!la) {
+		syslog(LOG_ERR, "shmat logarea failed %d", errno);
 		return 1;
+	}
+
+	shmctl(shmid, IPC_RMID, NULL);

 	if (size < MAX_MSG_SIZE)
 		size = LOG_SPACE_SIZE;

 	if ((shmid = shmget(IPC_PRIVATE, size,
 			    0644 | IPC_CREAT | IPC_EXCL)) == -1) {
+		syslog(LOG_ERR, "shmget msg failed %d", errno);
 		shmdt(la);
 		return 1;
 	}

 	la->start = shmat(shmid, NULL, 0);
 	if (!la->start) {
+		syslog(LOG_ERR, "shmat msg failed %d", errno);
 		shmdt(la);
 		return 1;
 	}
 	memset(la->start, 0, size);

+	shmctl(shmid, IPC_RMID, NULL);
+
 	la->empty = 1;
 	la->end = la->start + size;
 	la->head = la->start;
@@ -82,18 +93,23 @@ static int logarea_init (int size)

 	if ((shmid = shmget(IPC_PRIVATE, MAX_MSG_SIZE + sizeof(struct logmsg),
 			    0644 | IPC_CREAT | IPC_EXCL)) == -1) {
+		syslog(LOG_ERR, "shmget logmsg failed %d", errno);
 		shmdt(la->start);
 		shmdt(la);
 		return 1;
 	}
 	la->buff = shmat(shmid, NULL, 0);
 	if (!la->buff) {
+		syslog(LOG_ERR, "shmat logmsgfailed %d", errno);
 		shmdt(la->start);
 		shmdt(la);
 		return 1;
 	}

+	shmctl(shmid, IPC_RMID, NULL);
+
 	if ((la->semid = semget(SEMKEY, 1, 0666 | IPC_CREAT)) < 0) {
+		syslog(LOG_ERR, "semget failed %d", errno);
 		shmdt(la->buff);
 		shmdt(la->start);
 		shmdt(la);
@@ -102,6 +118,7 @@ static int logarea_init (int size)

 	la->semarg.val=1;
 	if (semctl(la->semid, 0, SETVAL, la->semarg) < 0) {
+		syslog(LOG_ERR, "semctl failed %d", errno);
 		shmdt(la->buff);
 		shmdt(la->start);
 		shmdt(la);
diff --git a/usr/spc.c b/usr/spc.c
index 60fd7d7..ac5c3de 100644
--- a/usr/spc.c
+++ b/usr/spc.c
@@ -383,6 +383,9 @@ int spc_mode_select(int host_no, struct scsi_cmd *cmd,
 		if (block_descriptor_len != BLOCK_DESCRIPTOR_LEN)
 			goto sense;

+		memcpy(cmd->dev->mode_block_descriptor, data + offset,
+		       BLOCK_DESCRIPTOR_LEN);
+
 		offset += 8;
 	}

diff --git a/usr/tgtd.c b/usr/tgtd.c
index 0b1cb4c..62aaa04 100644
--- a/usr/tgtd.c
+++ b/usr/tgtd.c
@@ -38,26 +38,13 @@
 #include "work.h"
 #include "util.h"

-struct tgt_event {
-	union {
-		event_handler_t *handler;
-		counter_event_handler_t *counter_handler;
-	};
-	union {
-		int fd;
-		int *counter;
-	};
-	void *data;
-	struct list_head e_list;
-};
-
 unsigned long pagesize, pageshift, pagemask;

 int system_active = 1;
 static int ep_fd;
 static char program_name[] = "tgtd";
 static LIST_HEAD(tgt_events_list);
-static LIST_HEAD(tgt_counter_events_list);
+static LIST_HEAD(tgt_sched_events_list);

 static struct option const long_options[] =
 {
@@ -136,22 +123,6 @@ int tgt_event_add(int fd, int events, event_handler_t handler, void *data)
 	return err;
 }

-int tgt_counter_event_add(int *counter, counter_event_handler_t handler,
-			  void *data)
-{
-	struct tgt_event *tev;
-
-	tev = zalloc(sizeof(*tev));
-	if (!tev)
-		return -ENOMEM;
-
-	tev->data = data;
-	tev->counter_handler = handler;
-	tev->counter = counter;
-	list_add(&tev->e_list, &tgt_counter_events_list);
-	return 0;
-}
-
 static struct tgt_event *tgt_event_lookup(int fd)
 {
 	struct tgt_event *tev;
@@ -163,17 +134,6 @@ static struct tgt_event *tgt_event_lookup(int fd)
 	return NULL;
 }

-static struct tgt_event *tgt_counter_event_lookup(int *counter)
-{
-	struct tgt_event *tev;
-
-	list_for_each_entry(tev, &tgt_counter_events_list, e_list) {
-		if (tev->counter == counter)
-			return tev;
-	}
-	return NULL;
-}
-
 void tgt_event_del(int fd)
 {
 	struct tgt_event *tev;
@@ -189,20 +149,6 @@ void tgt_event_del(int fd)
 	free(tev);
 }

-void tgt_counter_event_del(int *counter)
-{
-	struct tgt_event *tev;
-
-	tev = tgt_counter_event_lookup(counter);
-	if (!tev) {
-		eprintf("Cannot find counter event %p\n", counter);
-		return;
-	}
-
-	list_del(&tev->e_list);
-	free(tev);
-}
-
 int tgt_event_modify(int fd, int events)
 {
 	struct epoll_event ev;
@@ -221,26 +167,62 @@ int tgt_event_modify(int fd, int events)
 	return epoll_ctl(ep_fd, EPOLL_CTL_MOD, fd, &ev);
 }

+void tgt_init_sched_event(struct tgt_event *evt,
+			  sched_event_handler_t sched_handler, void *data)
+{
+	evt->sched_handler = sched_handler;
+	evt->scheduled = 0;	/* not on tgt_sched_events_list yet */
+	evt->data = data;
+	INIT_LIST_HEAD(&evt->e_list);
+}
+
+void tgt_add_sched_event(struct tgt_event *evt)
+{
+	if (!evt->scheduled) {	/* already-queued events are left in place */
+		evt->scheduled = 1;
+		list_add_tail(&evt->e_list, &tgt_sched_events_list);
+	}
+}
+
+void tgt_remove_sched_event(struct tgt_event *evt)
+{
+	if (evt->scheduled) {	/* nothing to do unless currently queued */
+		evt->scheduled = 0;
+		list_del_init(&evt->e_list);
+	}
+}
+
+static int tgt_exec_scheduled(void)
+{
+	struct list_head *last_sched;
+	struct tgt_event *tev, *tevn;
+	int work_remains = 0;
+
+	if (!list_empty(&tgt_sched_events_list)) {
+		/* run only events queued before this pass started */
+		last_sched = tgt_sched_events_list.prev;
+		list_for_each_entry_safe(tev, tevn, &tgt_sched_events_list,
+					 e_list) {
+			tgt_remove_sched_event(tev); /* handler may re-add */
+			tev->sched_handler(tev);
+			if (&tev->e_list == last_sched)
+				break;
+		}
+		if (!list_empty(&tgt_sched_events_list))
+			work_remains = 1; /* queued during this pass */
+	}
+	return work_remains;
+}
+
 static void event_loop(void)
 {
-	int nevent, i, done, timeout = TGTD_TICK_PERIOD * 1000;
+	int nevent, i, sched_remains, timeout;
 	struct epoll_event events[1024];
-	struct tgt_event *tev, *tevn;
+	struct tgt_event *tev;

 retry:
-	/*
-	 * Check the counter events to see if they have any work to run.
-	 */
-	do {
-		done = 1;
-		list_for_each_entry_safe(tev, tevn, &tgt_counter_events_list,
-					e_list) {
-			if (*tev->counter) {
-				done = 0;
-				tev->counter_handler(tev->counter, tev->data);
-			}
-		}
-	} while (!done);
+	sched_remains = tgt_exec_scheduled();
+	timeout = sched_remains ? 0 : TGTD_TICK_PERIOD * 1000;

 	nevent = epoll_wait(ep_fd, events, ARRAY_SIZE(events), timeout);
 	if (nevent < 0) {
diff --git a/usr/tgtd.h b/usr/tgtd.h
index 4febcd3..da751c8 100644
--- a/usr/tgtd.h
+++ b/usr/tgtd.h
@@ -206,13 +206,20 @@ extern int tgt_bind_host_to_target(int tid, int host_no);
 extern int tgt_unbind_host_to_target(int tid, int host_no);
 extern int tgt_bound_target_lookup(int host_no);

-typedef void (event_handler_t)(int fd, int events, void *data);
-typedef void (counter_event_handler_t)(int *counter, void *data);
+struct tgt_event;
+typedef void (*sched_event_handler_t)(struct tgt_event *tev);
+
+extern void tgt_init_sched_event(struct tgt_event *evt,
+			  sched_event_handler_t sched_handler, void *data);
+
+typedef void (*event_handler_t)(int fd, int events, void *data);
+
 extern int tgt_event_add(int fd, int events, event_handler_t handler, void *data);
-extern int tgt_counter_event_add(int *counter, counter_event_handler_t handler,
-				 void *data);
 extern void tgt_event_del(int fd);
-extern void tgt_counter_event_del(int *counter);
+
+extern void tgt_add_sched_event(struct tgt_event *evt);
+extern void tgt_remove_sched_event(struct tgt_event *evt);
+
 extern int tgt_event_modify(int fd, int events);
 extern int target_cmd_queue(int tid, struct scsi_cmd *cmd);
 extern void target_cmd_done(struct scsi_cmd *cmd);
@@ -262,4 +269,17 @@ extern int dtd_load_unload(int tid, uint64_t lun, int load, char *file);
 extern int register_backingstore_template(struct backingstore_template *bst);
 extern struct backingstore_template *get_backingstore_template(const char *name);

+struct tgt_event {
+	union {
+		event_handler_t handler;	/* fd-event callback */
+		sched_event_handler_t sched_handler; /* sched-event callback */
+	};
+	union {
+		int fd;
+		int scheduled;	/* set while queued by tgt_add_sched_event */
+	};
+	void *data;	/* caller-supplied context */
+	struct list_head e_list;	/* link in an event list */
+};
+
 #endif