diff --git a/MultiClassTsetlinMachine.c b/MultiClassTsetlinMachine.c
index 2b21baa..a8c71a2 100644
--- a/MultiClassTsetlinMachine.c
+++ b/MultiClassTsetlinMachine.c
@@ -101,10 +101,13 @@ void mc_tm_update(struct MultiClassTsetlinMachine *mc_tm, int Xi[], int target_c
 	tm_update(mc_tm->tsetlin_machines[target_class], Xi, 1, s);
 
 	// Randomly pick one of the other classes, for pairwise learning of class output 
-	unsigned int negative_target_class = (unsigned int)CLASSES * 1.0*rand()/((unsigned int)RAND_MAX+1);
-	while (negative_target_class == target_class) {
-		negative_target_class = (unsigned int)CLASSES * 1.0*rand()/((unsigned int)RAND_MAX+1);
-	}
+	unsigned int negative_target_class;
+	do {
+		negative_target_class = (unsigned int)(tm_rng_uniform01() * (double)CLASSES);
+		if (negative_target_class >= (unsigned int)CLASSES) {	// Clamp, because float scaling
+			negative_target_class = (unsigned int)CLASSES - 1U; 
+		}
+	} while (negative_target_class == (unsigned int)target_class);
 
 	tm_update(mc_tm->tsetlin_machines[negative_target_class], Xi, 0, s);
 }
diff --git a/NoisyParityDemo.c b/NoisyParityDemo.c
index 9c7201b..3f6d13b 100644
--- a/NoisyParityDemo.c
+++ b/NoisyParityDemo.c
@@ -2,6 +2,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 #include <time.h>
 #include <string.h>
 
@@ -39,7 +40,8 @@ void read_file(void)
 		}
 		y_train[i] = atoi(token);
 	}
-
+	fclose(fp);
+	
 	fp = fopen("NoisyParityTestingData.txt", "r");
 	if (fp == NULL) {
 		printf("Error opening\n");
@@ -58,12 +60,15 @@ void read_file(void)
 		}
 		y_test[i] = atoi(token);
 	}
+	fclose(fp);
+	free(line);
 }
 
 
 int main(void)
 {	
 	srand(time(NULL));
+	tm_rng_seed((uint64_t)time(NULL) ^ ((uint64_t)clock() << 32));
 
 	read_file();
 
diff --git a/TsetlinMachine.c b/TsetlinMachine.c
index 6435a9a..82cad30 100644
--- a/TsetlinMachine.c
+++ b/TsetlinMachine.c
@@ -56,7 +56,7 @@ void tm_initialize(struct TsetlinMachine *tm)
 					for (int m = 0; m < LEAF_ALTERNATIVES; m++) {
 						for (int n = 0; n < LEAF_FACTORS; n++) {
 
-							if (1.0 * rand()/RAND_MAX <= 0.5) {
+							if (tm_rng_uniform01() <= 0.5) {
 								(*tm).ta_state[i][j][k][l][m][n] = NUMBER_OF_STATES;
 								(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] = NUMBER_OF_STATES + 1;
 							} else {
@@ -80,7 +80,57 @@ static inline int action(int state)
 /* Calculate the output of each clause using the actions of each Tsetline Automaton. */
 /* Output is stored an internal output array. */
 
-static inline void calculate_clause_output(struct TsetlinMachine *tm, int Xi[], int predict)
+/* Scoring-only clause evaluation: fills clause_output[] using the same vote arithmetic as the */
+/* update path, without writing clause_component_output or the other per-component arrays. */
+/* Xi[] is read at index feature and at feature + FEATURES (presumably the negated inputs; confirm). */
+static inline void calculate_clause_output_predict(struct TsetlinMachine *tm, int Xi[])
+{
+	for (int clause = 0; clause < CLAUSES; clause++) {
+		/* Start from the multiplicative identity; every root factor scales it below. */
+		tm->clause_output[clause] = 1;
+
+		for (int j = 0; j < ROOT_FACTORS; j++) {
+			int alternatives_total = 0;	/* summed over interior alternatives */
+
+			for (int k = 0; k < INTERIOR_ALTERNATIVES; k++) {
+				int factors_product = 1;	/* multiplied over interior factors */
+
+				for (int l = 0; l < INTERIOR_FACTORS; l++) {
+					/* Index in Xi[] of the first feature of this (j, l) leaf group. */
+					const int first_feature = j * INTERIOR_FACTORS * LEAF_FACTORS + l * LEAF_FACTORS;
+					int leaves_total = 0;	/* summed over leaf alternatives */
+
+					for (int m = 0; m < LEAF_ALTERNATIVES; m++) {
+						const int *automata = tm->ta_state[clause][j][k][l][m];
+						int n = 0;
+
+						/* Stop at the first included literal whose input bit is 0. */
+						while (n < LEAF_FACTORS) {
+							if (action(automata[n]) == 1 && Xi[first_feature + n] == 0) {
+								break;
+							}
+							if (action(automata[n + LEAF_FACTORS]) == 1 && Xi[first_feature + n + FEATURES] == 0) {
+								break;
+							}
+							n++;
+						}
+
+						/* The component votes 1 only when no literal stopped the scan. */
+						leaves_total += (n == LEAF_FACTORS);
+					}
+
+					factors_product *= leaves_total;
+				}
+
+				alternatives_total += factors_product;
+			}
+
+			tm->clause_output[clause] *= alternatives_total;
+		}
+	}
+}
+
+static inline void calculate_clause_output_update(struct TsetlinMachine *tm, int Xi[])
 {
 	int action_include;
 
@@ -138,6 +188,15 @@ static inline void calculate_clause_output(struct TsetlinMachine *tm, int Xi[],
 	}
 }
 
+static inline void calculate_clause_output(struct TsetlinMachine *tm, int Xi[], int predict)
+{
+	if (predict != PREDICT) {	/* every mode other than PREDICT takes the update path */
+		calculate_clause_output_update(tm, Xi);
+		return;
+	}
+	calculate_clause_output_predict(tm, Xi);	/* scoring path */
+}
+
 /* Sum up the votes for each class (this is the multiclass version of the Tsetlin Machine) */
 static inline int sum_up_class_votes(struct TsetlinMachine *tm)
 {
@@ -167,24 +226,24 @@ static inline void type_i_feedback(struct TsetlinMachine *tm, int Xi[], int i, i
 {
 	if ((*tm).clause_output[i] == 0 || (*tm).interior_vote_products[i][j][k] == 0 || (*tm).clause_component_output[i][j][k][l][m] == 0)	{
 		for (int n = 0; n < LEAF_FACTORS; n++) {
-			(*tm).ta_state[i][j][k][l][m][n] -= ((*tm).ta_state[i][j][k][l][m][n] > 1) && (1.0*rand()/RAND_MAX <= 1.0/s);	
+			(*tm).ta_state[i][j][k][l][m][n] -= ((*tm).ta_state[i][j][k][l][m][n] > 1) && (tm_rng_uniform01() <= 1.0/s);	
 
-			(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] -= ((*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] > 1) && (1.0*rand()/RAND_MAX <= 1.0/s);				
+			(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] -= ((*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] > 1) && (tm_rng_uniform01() <= 1.0/s);				
 		}
 	} else {
 		int feature_index = j * INTERIOR_FACTORS * LEAF_FACTORS + l * LEAF_FACTORS;
 
 		for (int n = 0; n < LEAF_FACTORS; n++) {
 			if (Xi[feature_index + n] == 1) {
-				(*tm).ta_state[i][j][k][l][m][n] += ((*tm).ta_state[i][j][k][l][m][n] < NUMBER_OF_STATES*2) && (BOOST_TRUE_POSITIVE_FEEDBACK == 1 || 1.0*rand()/RAND_MAX <= (s-1)/s);
+				(*tm).ta_state[i][j][k][l][m][n] += ((*tm).ta_state[i][j][k][l][m][n] < NUMBER_OF_STATES*2) && (BOOST_TRUE_POSITIVE_FEEDBACK == 1 || tm_rng_uniform01() <= (s-1)/s);
 			} else {				
-				(*tm).ta_state[i][j][k][l][m][n] -= ((*tm).ta_state[i][j][k][l][m][n] > 1) && (1.0*rand()/RAND_MAX <= 1.0/s);
+				(*tm).ta_state[i][j][k][l][m][n] -= ((*tm).ta_state[i][j][k][l][m][n] > 1) && (tm_rng_uniform01() <= 1.0/s);
 			}
 
 			if (Xi[feature_index + n + FEATURES] == 1) {
-				(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] += ((*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] < NUMBER_OF_STATES*2) && (BOOST_TRUE_POSITIVE_FEEDBACK == 1 || 1.0*rand()/RAND_MAX <= (s-1)/s);
+				(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] += ((*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] < NUMBER_OF_STATES*2) && (BOOST_TRUE_POSITIVE_FEEDBACK == 1 || tm_rng_uniform01() <= (s-1)/s);
 			} else {				
-				(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] -= ((*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] > 1) && (1.0*rand()/RAND_MAX <= 1.0/s);
+				(*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] -= ((*tm).ta_state[i][j][k][l][m][n + LEAF_FACTORS] > 1) && (tm_rng_uniform01() <= 1.0/s);
 			}
 		}
 	}
@@ -244,7 +303,7 @@ void tm_update(struct TsetlinMachine *tm, int Xi[], int target, float s) {
 			for (int k = 0; k < INTERIOR_ALTERNATIVES; k++) {
 				for (int l = 0; l < INTERIOR_FACTORS; l++) {
 					for (int m = 0; m < LEAF_ALTERNATIVES; m++) {
-						(*tm).feedback_to_components[i][j][k][l][m] = sign*(2*target-1)*(1.0*rand()/RAND_MAX <= (1.0/(THRESHOLD*2))*(THRESHOLD + (1 - 2*target)*class_sum));
+						(*tm).feedback_to_components[i][j][k][l][m] = sign*(2*target-1)*(tm_rng_uniform01() <= (1.0/(THRESHOLD*2))*(THRESHOLD + (1 - 2*target)*class_sum));
 					}
 				}
 			}
@@ -286,4 +345,3 @@ int tm_score(struct TsetlinMachine *tm, int Xi[]) {
 	return sum_up_class_votes(tm);
 }
 
-
diff --git a/TsetlinMachine.h b/TsetlinMachine.h
index c2689e4..37aa311 100644
--- a/TsetlinMachine.h
+++ b/TsetlinMachine.h
@@ -44,6 +44,12 @@ This code implements the Tsetlin Machine from paper arXiv:1804.01508
 #define PREDICT 1
 #define UPDATE 0
 
+#include <stdint.h>
+
+void tm_rng_seed(uint64_t seed);	// (Re)seed the file-static generators in tm_random.c; a seed of 0 is replaced by 1
+double tm_rng_uniform01(void);	// Next pseudo-random double, one 32-bit PCG32 draw scaled into the unit interval
+
+
 struct TsetlinMachine {
 	int ta_state[CLAUSES][ROOT_FACTORS][INTERIOR_ALTERNATIVES][INTERIOR_FACTORS][LEAF_ALTERNATIVES][LITERALS_PER_GROUP]; // The clause components, unique per clause (later we can introduce sharing)
 	int leaf_vote_sum[CLAUSES][ROOT_FACTORS][INTERIOR_ALTERNATIVES][INTERIOR_FACTORS]; // Stores how many class votes you get per feature group (vote summation over leaf alternatives)
diff --git a/makefile b/makefile
index 6d50205..34bc827 100644
--- a/makefile
+++ b/makefile
@@ -1,5 +1,13 @@
-NoisyParityDemo: MultiClassTsetlinMachine.c MultiClassTsetlinMachine.h TsetlinMachine.c TsetlinMachine.h NoisyParityDemo.c
-	gcc -Wall -O3 -ffast-math -o NoisyParityDemo NoisyParityDemo.c MultiClassTsetlinMachine.c TsetlinMachine.c 
+SOURCES = NoisyParityDemo.c MultiClassTsetlinMachine.c TsetlinMachine.c tm_random.c
+HEADERS = MultiClassTsetlinMachine.h TsetlinMachine.h
+
+CC ?= gcc
+CFLAGS_COMMON = -Wall -O3 -ffast-math -march=native -flto
+
+NoisyParityDemo: $(SOURCES) $(HEADERS)
+	$(CC) $(CFLAGS_COMMON) -o NoisyParityDemo $(SOURCES)
 
+# clean names a command, not a file: keep it runnable even if a file called "clean" exists.
+.PHONY: clean
 clean:
-	rm *.o NoisyParityDemo
+	rm -f NoisyParityDemo
diff --git a/tm_random.c b/tm_random.c
new file mode 100644
index 0000000..d8526db
--- /dev/null
+++ b/tm_random.c
@@ -0,0 +1,75 @@
+/*
+ * Fast RNG for the hierarchical TM demo.
+ *
+ * The PCG32 implementation is taken from CAIR TMU (same file layout as
+ * tmu/lib/src/random/pcg32_fast.c in https://github.com/cair/tmu).
+ *
+ * XOR-shift 128+ is also from that tree (tmu/lib/src/random/xorshift128.c).
+ * TMU's Python ClauseBank seeds both generators; we match that. Timed runs
+ * on this workload were slightly faster with PCG32 than with 128+ alone,
+ * so tm_rng_uniform01() uses PCG32 only.
+ */
+
+#include <stdint.h>
+
+#include "TsetlinMachine.h"
+
+/* ---- PCG32 (from TMU) ---- */
+
+static uint64_t const pcg32_multiplier = 6364136223846793005u;
+
+/*
+ * Generator state. It must be odd: the multiplier is odd, so the low bit
+ * never changes, and an even state shortens the period (0 and 2^63 never
+ * change at all).
+ */
+static uint64_t pcg32_state = 0xcafef00dd15ea5e5u;
+
+/* Return the next 32-bit output and advance the state by one multiply. */
+static uint32_t pcg32_draw(void)
+{
+	uint64_t x = pcg32_state;
+	unsigned int count = (unsigned int)(x >> 61);	/* top 3 bits pick the extra shift */
+
+	pcg32_state = x * pcg32_multiplier;
+	x ^= x >> 22;
+	return (uint32_t)(x >> (22 + count));
+}
+
+/* Install a state derived from seed; every 64-bit seed value is accepted. */
+static void pcg32_seed_local(uint64_t seed)
+{
+	pcg32_state = (seed << 1) | 1u;	/* force the state odd (see pcg32_state) */
+	(void)pcg32_draw();		/* discard the first output, which is computed from the unmixed seed */
+}
+
+/* ---- xorshift128+ (from TMU; seeded for parity, not used for floats) ---- */
+
+static uint64_t xs128_state[2] = {0xcafef00dbadc0ffeULL, 0xdeadbeef12345678ULL};
+
+/* Store seed and its complement, so the two words are never both zero. */
+static void xorshift128p_seed_local(uint64_t seed)
+{
+	xs128_state[0] = seed;
+	xs128_state[1] = ~seed;
+}
+
+/* Seed both generators from one value; a seed of 0 is replaced by 1. */
+void tm_rng_seed(uint64_t seed)
+{
+	uint64_t s = seed ? seed : 1ULL;
+
+	pcg32_seed_local(s);
+	xorshift128p_seed_local(s);
+}
+
+/*
+ * Return a pseudo-random double in [0, 1).
+ *
+ * The 32-bit draw is scaled by 2^-32, so the result is at most 1 - 2^-32
+ * and never reaches 1.0; scaling it by a small class count stays in range.
+ */
+double tm_rng_uniform01(void)
+{
+	return (double)pcg32_draw() * (1.0 / 4294967296.0);
+}