Replace the per-label access functions with an optimized generic version that uses code replication. It also works when compiled with optimization level -O2.
Signed-off-by: Alexandre Amory <alexandre.amory@santannapisa.it>
diff --git a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/generators/LabelGenerator.xtend b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/generators/LabelGenerator.xtend
index 497b97d..e11191c 100644
--- a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/generators/LabelGenerator.xtend
+++ b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/generators/LabelGenerator.xtend
@@ -17,7 +17,7 @@
 
 import java.util.List
 import org.eclipse.app4mc.amalthea.model.Label
-import org.eclipse.app4mc.slg.commons.m2t.AmaltheaModelUtils
+//import org.eclipse.app4mc.slg.commons.m2t.AmaltheaModelUtils
 
 class LabelGenerator {
 
@@ -28,9 +28,10 @@
 
 	// ---------- names of generated 'C' functions ----------
 
-	static def String initCall(Label label) 				'''initialize_«label.name»()'''	
-	static def String readCall(Label label, String param) 	'''read_«label.name»(«param»)'''	
-	static def String writeCall(Label label, String param)	'''write_«label.name»(«param»)'''	
+	// No longer used: label accesses are now generated as calls to the generic read_label/write_label functions.
+	//static def String initCall(Label label) 				'''initialize_«label.name»()'''	
+	//static def String readCall(Label label, String param) 	'''read_«label.name»(«param»)'''	
+	//static def String writeCall(Label label, String param)	'''write_«label.name»(«param»)'''	
 
 	// ---------- generate file contents ----------
 
@@ -49,94 +50,12 @@
 				)	
 	'''
 
-	static def String toH(Label label)
-	'''
-		void «initCall(label)»;
-		void «readCall(label, "int labelAccessStatistics")»;
-		void «writeCall(label, "int labelAccessStatistics")»;
-		
-	'''
-
-	static def String toCpp(Label label) {
+	static def String toH(Label label){
 		val name = if (label?.name.isNullOrEmpty) "<undefined label>" else label.name
-		val numberOfBytes = if (label?.size === null) 0 else label.size.numberBytes
-
+		val numberOfBytes = if (label?.size === null) 1 else label.size.numberBytes
+		
 		'''
-		int «name»[«AmaltheaModelUtils.getLabelArraySize(label)»];	
-		
-		static bool isIinitialized_«name» = false;
-		void «initCall(label)» {
-			if (!isIinitialized_«name»){
-				for (int i=0; i < «AmaltheaModelUtils.getLabelArraySize(label)»; i++){
-					«name»[i] = i+1;
-				}
-				isIinitialized_«name» = true;
-			}
-		}
-		
-		
-		void «readCall(label, "int labelAccessStatistics")» {
-			int numberOfBytes = «numberOfBytes»;
-			
-			for (int repeat = 0 ; repeat < labelAccessStatistics; repeat++){
-				if(numberOfBytes < 4){
-					numberOfBytes = 4;
-				}
-				int arraysize = sizeof(«name») / 4;
-				
-				//printf("number of bytes:%d\n",arraysize);
-				int leftOverElements=arraysize%10;
-				
-				int arraySizeWith10Multiples=arraysize-leftOverElements;
-				int i = 0;
-				int a = 0;
-				for (i = 0; i < arraySizeWith10Multiples; i = i + 10) {           //iteration with 10 reads
-					a = «name»[i];
-					a = «name»[i+1];
-					a = «name»[i+2];
-					a = «name»[i+3];
-					a = «name»[i+4];
-					a = «name»[i+5];
-					a = «name»[i+6];
-					a = «name»[i+7];
-					a = «name»[i+8];
-					a = «name»[i+9];
-				}
-				for(;i<arraysize;i++){
-					a = «name»[i];
-				}
-			}
-		}
-		
-		void «writeCall(label, "int labelAccessStatistics")» {
-			int numberOfBytes = «numberOfBytes»;
-			for (int repeat = 0 ; repeat < labelAccessStatistics; repeat++){
-				if(numberOfBytes < 4){
-					numberOfBytes = 4;
-				}
-				int arraysize = sizeof(«name») / 4;
-				int leftOverElements=arraysize%10;
-				int arraySizeWith10Multiples=arraysize-leftOverElements;
-				
-				int i = 0;
-				for (i = 0; i < arraySizeWith10Multiples; i = i + 10) {
-					«name»[i]   = 0xAFFE;
-					«name»[i+1] = 0xAFFE;
-					«name»[i+2] = 0xAFFE;
-					«name»[i+3] = 0xAFFE;
-					«name»[i+4] = 0xAFFE;
-					«name»[i+5] = 0xAFFE;
-					«name»[i+6] = 0xAFFE;
-					«name»[i+7] = 0xAFFE;
-					«name»[i+8] = 0xAFFE;
-					«name»[i+9] = 0xAFFE;
-				}
-				for(;i<arraysize;i++){
-						«name»[i]=0xAFFE;
-				}
-			}
-		}
-		
+			uint8_t «name»[«numberOfBytes»];
 		'''
 	}
 
diff --git a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelAccessTransformer.java b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelAccessTransformer.java
index d5969a1..7ec573c 100644
--- a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelAccessTransformer.java
+++ b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelAccessTransformer.java
@@ -61,13 +61,15 @@
 		}
 
 		final NumericStatistic stat = statisticValueOrNull(labelAccess);
-		final String statString = (stat == null) ? "1" : getNumericStatistic(stat);
+		//final String statString = (stat == null) ? "1" : getNumericStatistic(stat);
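+		// NOTE(review): possible bug fix - passes the label size in bytes instead of the access statistic; confirm this is intended and that getSize() cannot be null here.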
+		final String statString = String.valueOf( labelAccess.getData().getSize().getNumberBytes() );
 
 		switch (labelAccess.getAccess()) {
 		case READ:
-			return "read_" + labelAccess.getData().getName() + "(" + statString + ")"; 
+			return "read_label (" + labelAccess.getData().getName() + "," + statString + ")"; 
 		case WRITE:
-			return "write_" + labelAccess.getData().getName() + "(" + statString + ")"; 
+			return "write_label (" + labelAccess.getData().getName() + "," + statString + ")"; 
 		default:
 			return null;
 		}
diff --git a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelTransformer.java b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelTransformer.java
index 1e3efe1..67babc5 100644
--- a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelTransformer.java
+++ b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/LabelTransformer.java
@@ -90,15 +90,117 @@
 	protected void genFiles(SLGTranslationUnit tu, Label label) {
 		if (isSrcFileEmpty(tu)) {
 			srcAppend(tu, "#include \"" + getIncFile(tu) + "\"\n\n");
+			toCpp(tu);
 		}
 		if (isIncFileEmpty(tu)) {	// Mc: ???
-			incAppend(tu, "#include <stdbool.h>\n\n");
+			incAppend(tu, "#include <stdint.h>\n\n");
+			incAppend(tu, "uint64_t read_label(uint8_t *p, int size);\n");
+			incAppend(tu, "uint64_t write_label(uint8_t *p, int size);\n\n");
 		}
 
 		incAppend(tu, LabelGenerator.toH(label));
-		srcAppend(tu, LabelGenerator.toCpp(label));
+		//srcAppend(tu, LabelGenerator.toCpp(label));
 	}
 
+	// When compiled with -O2, the main loop of 'read_label' looks like this: mostly 8 x 64-bit memory reads per iteration.
+	// 
+	//.L2:
+
+	//	add    rcx,rdi
+	//	cmp    rdi,r8
+	//	jae    4c <read_label(unsigned char*, int)+0x4c>
+	//	mov    rdx,QWORD PTR [rdi+0x8]
+	//	add    rdi,0x40
+	//	add    rdx,QWORD PTR [rdi-0x40]
+	//	add    rdx,QWORD PTR [rdi-0x30]
+	//	add    rdx,QWORD PTR [rdi-0x28]
+	//	add    rdx,QWORD PTR [rdi-0x20]
+	//	add    rdx,QWORD PTR [rdi-0x18]
+	//	add    rdx,QWORD PTR [rdi-0x10]
+	//	add    rdx,QWORD PTR [rdi-0x8]
+	//	add    rax,rdx
+	//	cmp    r8,rdi
+	//	ja     20 <read_label(unsigned char*, int)+0x20>	/
+	//....
+	//
+	//
+	// The main loop of 'write_label' looks like this: mostly 8 x 64-bit memory writes per iteration.
+	//
+	//  	cmp    rdi,rcx
+	//  	jae    12e <write_label(unsigned char*, int)+0x4e>
+	//  	xchg   ax,ax
+	//  	mov    esi,0xaffffffe
+	//  	add    rdi,0x40
+	//  	mov    QWORD PTR [rdi-0x40],rsi
+	//  	mov    QWORD PTR [rdi-0x38],rsi
+	//  	mov    QWORD PTR [rdi-0x30],rsi
+	//  	mov    QWORD PTR [rdi-0x28],rsi
+	//  	mov    QWORD PTR [rdi-0x20],rsi
+	//  	mov    QWORD PTR [rdi-0x18],rsi
+	//  	mov    QWORD PTR [rdi-0x10],rsi
+	//  	mov    QWORD PTR [rdi-0x8],rsi
+	//  	cmp    rcx,rdi
+	//  	ja     100 <write_label(unsigned char*, int)+0x20>
+	//  	cmp    rax,rdx	
+	// 		...
+	
+	protected void toCpp(SLGTranslationUnit tu) {
+		srcAppend(tu, "uint64_t read_label(uint8_t *p, int size){\n");
+		srcAppend(tu, "  register uint64_t *p_start= (uint64_t*)p;\n");
+		srcAppend(tu, "  // 64 is the number of bytes moved inside the 1st while loop\n");
+		srcAppend(tu, "  uint64_t left_over = size & (64-1);\n");
+		srcAppend(tu, "  uint64_t multiple_of_64 = size - left_over;\n");
+		srcAppend(tu, "  register uint64_t *p_end_multiple_of_64= (uint64_t*)&(p[multiple_of_64-1]);\n");
+		srcAppend(tu, "  // pointers to the 2nd loop\n");
+		srcAppend(tu, "  register uint8_t *p_start2= &(p[multiple_of_64]);\n");
+		srcAppend(tu, "  register uint8_t *p_end= &(p[size-1]);\n");
+		srcAppend(tu, "  register uint64_t val=0;\n");
+		srcAppend(tu, "  // executes 8 moves of 8 bytes each\n");
+		srcAppend(tu, "  while(p_start<p_end_multiple_of_64){\n");
+		srcAppend(tu, "	val += p_start[0] + p_start[1] + p_start[2] + p_start[3] + \n");
+		srcAppend(tu, "		   p_start[4] + p_start[5] + p_start[6] + p_start[7];\n");
+		srcAppend(tu, "	p_start += 8;\n");
+		srcAppend(tu, "  }\n");
+		srcAppend(tu, "  // executes the remaining moves, byte by byte\n");
+		srcAppend(tu, "  while(p_start2<=p_end){\n");
+		srcAppend(tu, "	val += *p_start2;\n");
+		srcAppend(tu, "	p_start2 ++;\n");
+		srcAppend(tu, "  }\n");
+		srcAppend(tu, "  return val;\n");
+		srcAppend(tu, "}\n\n");
+		
+		srcAppend(tu, "uint64_t write_label(uint8_t *p, int size){\n");
+		srcAppend(tu, "    register uint64_t *p_start= (uint64_t*)p;\n");
+		srcAppend(tu, "    // 64 is the number of bytes moved inside the 1st while loop\n");
+		srcAppend(tu, "    uint64_t left_over = size & (64-1);\n");
+		srcAppend(tu, "    uint64_t multiple_of_64 = size - left_over;\n");
+		srcAppend(tu, "    register uint64_t *p_end_multiple_of_64= (uint64_t*)&(p[multiple_of_64-1]);\n");
+		srcAppend(tu, "    // pointers to the 2nd loop\n");
+		srcAppend(tu, "    register uint8_t *p_start2= &(p[multiple_of_64]);\n");
+		srcAppend(tu, "    register uint8_t *p_end= &(p[size-1]);\n");
+		srcAppend(tu, "    register uint64_t val=0;\n");
+		srcAppend(tu, "    // executes 8 moves of 8 bytes each\n");
+		srcAppend(tu, "    while(p_start<p_end_multiple_of_64){\n");
+		srcAppend(tu, "      p_start[0] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[1] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[2] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[3] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[4] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[5] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[6] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start[7] = 0xAFFFFFFE;\n");
+		srcAppend(tu, "      p_start +=8;\n");
+		srcAppend(tu, "    }\n");
+		srcAppend(tu, "    // executes the remaining moves, byte by byte\n");
+		srcAppend(tu, "    while(p_start2<=p_end){\n");
+		srcAppend(tu, "      *(p_start2)   = 0xAF;\n");
+		srcAppend(tu, "      p_start2 ++;\n");
+		srcAppend(tu, "    }\n");
+		srcAppend(tu, "    return val;\n");
+		srcAppend(tu, "}\n\n");
+				
+	}
+	
 	public boolean createCMake() {
 		return outputBuffer.appendTo(
 				"OTHER", MAKEFILE_PATH, LabelGenerator.toCMake(LIB_NAME, getSrcFiles()));
diff --git a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/TaskTransformer.java b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/TaskTransformer.java
index d303391..4bc5b4a 100644
--- a/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/TaskTransformer.java
+++ b/load_generator/commons/plugins/org.eclipse.app4mc.slg.commons.m2t/src/org/eclipse/app4mc/slg/commons/m2t/transformers/sw/TaskTransformer.java
@@ -121,7 +121,7 @@
 				SLGTranslationUnit labelTU = entry.getValue();
 
 				includes.add(getIncFile(labelTU));
-				initCalls.add(LabelGenerator.initCall(label));
+				//initCalls.add(LabelGenerator.initCall(label));
 			}
 		}
 
diff --git a/load_generator/linux/plugins/org.eclipse.app4mc.slg.linux/src/org/eclipse/app4mc/slg/linux/generators/LinuxMakeGenerator.xtend b/load_generator/linux/plugins/org.eclipse.app4mc.slg.linux/src/org/eclipse/app4mc/slg/linux/generators/LinuxMakeGenerator.xtend
index 47853b3..cabb07e 100644
--- a/load_generator/linux/plugins/org.eclipse.app4mc.slg.linux/src/org/eclipse/app4mc/slg/linux/generators/LinuxMakeGenerator.xtend
+++ b/load_generator/linux/plugins/org.eclipse.app4mc.slg.linux/src/org/eclipse/app4mc/slg/linux/generators/LinuxMakeGenerator.xtend
@@ -35,6 +35,8 @@
 		runnables.o: synthetic_gen/runnables/_src/runnables.c
 			gcc -c «IF !experimentalCodeMatching»-Isynthetic_gen/labels/_inc -Isynthetic_gen/ticksUtils/_inc«ENDIF»  -Isynthetic_gen/runnables/_inc  «IF experimentalCodeMatching»-Isynthetic_gen/codesnippets/_inc«ENDIF» synthetic_gen/runnables/_src/runnables.c -O2
 		
+		clean:
+			rm -f *.o ./synthetic
 		
 		«IF !experimentalCodeMatching»
 			labels.o: synthetic_gen/labels/_src/labels.c
diff --git a/load_generator/linux/releng/org.eclipse.app4mc.slg.linux.product/additionalTestData/minimal.amxmi b/load_generator/linux/releng/org.eclipse.app4mc.slg.linux.product/additionalTestData/minimal.amxmi
new file mode 100644
index 0000000..6c8f336
--- /dev/null
+++ b/load_generator/linux/releng/org.eclipse.app4mc.slg.linux.product/additionalTestData/minimal.amxmi
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<am:Amalthea xmi:version="2.0" xmlns:xmi="http://www.omg.org/XMI" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:am="http://app4mc.eclipse.org/amalthea/1.0.0">
+  <swModel>
+    <tasks xmi:id="single_task?type=Task" name="single_task" stimuli="stim_100ms?type=PeriodicStimulus" multipleTaskActivationLimit="0">
+      <activityGraph>
+        <items xsi:type="am:RunnableCall" runnable="simple_runnable?type=Runnable"/>
+        <items xsi:type="am:RunnableCall" runnable="rand_runnable?type=Runnable"/>
+      </activityGraph>
+    </tasks>
+    <runnables xmi:id="simple_runnable?type=Runnable" name="simple_runnable" callback="false" service="false">
+      <activityGraph>
+        <items xsi:type="am:LabelAccess" data="vet?type=Label" access="read"/>
+        <items xsi:type="am:LabelAccess" data="scalar32?type=Label" access="read"/>
+        <items xsi:type="am:LabelAccess" data="scalar16?type=Label" access="read"/>
+        <items xsi:type="am:Ticks">
+          <default xsi:type="am:DiscreteValueConstant" value="1000000"/>
+        </items>
+        <items xsi:type="am:LabelAccess" data="scalar8?type=Label" access="write"/>
+      </activityGraph>
+    </runnables>
+    <runnables xmi:id="rand_runnable?type=Runnable" name="rand_runnable" callback="false" service="false">
+      <activityGraph>
+        <items xsi:type="am:LabelAccess" data="vet?type=Label" access="read"/>
+        <items xsi:type="am:LabelAccess" data="scalar16?type=Label" access="read"/>
+        <items xsi:type="am:Ticks">
+          <default xsi:type="am:DiscreteValueStatistics" lowerBound="100000" upperBound="500000" average="150000.0"/>
+        </items>
+        <items xsi:type="am:LabelAccess" data="scalar32?type=Label" access="write"/>
+      </activityGraph>
+    </runnables>
+    <labels xmi:id="vet?type=Label" name="vet" constant="false" bVolatile="false">
+      <size value="100" unit="B"/>
+    </labels>
+    <labels xmi:id="scalar16?type=Label" name="scalar16" constant="false" bVolatile="false">
+      <size value="16" unit="bit"/>
+    </labels>
+    <labels xmi:id="scalar8?type=Label" name="scalar8" constant="false" bVolatile="false">
+      <size value="1" unit="B"/>
+    </labels>
+    <labels xmi:id="scalar32?type=Label" name="scalar32" constant="false" bVolatile="false">
+      <size value="4" unit="B"/>
+    </labels>
+  </swModel>
+  <hwModel>
+    <definitions xsi:type="am:ProcessingUnitDefinition" xmi:id="slow_core?type=ProcessingUnitDefinition" name="slow_core" puType="CPU"/>
+    <definitions xsi:type="am:ProcessingUnitDefinition" xmi:id="fast_core?type=ProcessingUnitDefinition" name="fast_core" puType="CPU"/>
+    <structures xmi:id="no-name?type=HwStructure" name="">
+      <modules xsi:type="am:ProcessingUnit" xmi:id="slow1?type=ProcessingUnit" name="slow1" frequencyDomain="slow_freq?type=FrequencyDomain" definition="slow_core?type=ProcessingUnitDefinition"/>
+      <modules xsi:type="am:ProcessingUnit" xmi:id="fast1?type=ProcessingUnit" name="fast1" definition="fast_core?type=ProcessingUnitDefinition"/>
+    </structures>
+    <domains xsi:type="am:FrequencyDomain" xmi:id="slow_freq?type=FrequencyDomain" name="slow_freq" clockGating="false">
+      <defaultValue value="100.0" unit="MHz"/>
+    </domains>
+    <domains xsi:type="am:FrequencyDomain" xmi:id="fast_freq?type=FrequencyDomain" name="fast_freq" clockGating="false">
+      <defaultValue value="1.0" unit="GHz"/>
+    </domains>
+  </hwModel>
+  <osModel>
+    <operatingSystems name="os">
+      <taskSchedulers xmi:id="task_os?type=TaskScheduler" name="task_os"/>
+    </operatingSystems>
+  </osModel>
+  <stimuliModel>
+    <stimuli xsi:type="am:PeriodicStimulus" xmi:id="stim_100ms?type=PeriodicStimulus" name="stim_100ms">
+      <recurrence value="100" unit="ms"/>
+    </stimuli>
+  </stimuliModel>
+  <mappingModel>
+    <taskAllocation task="single_task?type=Task" scheduler="task_os?type=TaskScheduler" affinity="fast1?type=ProcessingUnit"/>
+  </mappingModel>
+</am:Amalthea>
diff --git a/load_generator/linux/releng/org.eclipse.app4mc.slg.linux.product/additionalTestData/readme b/load_generator/linux/releng/org.eclipse.app4mc.slg.linux.product/additionalTestData/readme
new file mode 100644
index 0000000..8414ce0
--- /dev/null
+++ b/load_generator/linux/releng/org.eclipse.app4mc.slg.linux.product/additionalTestData/readme
@@ -0,0 +1 @@
+minimal.amxmi - use it to test labels of different sizes and runnables with constant, random, and extended ticks.