YassineYousfi commited on
Commit
06242ba
1 Parent(s): 0b1e65b
Files changed (18) hide show
  1. Makefile +9 -0
  2. README copy.md +2 -0
  3. app.py +7 -0
  4. common.cpp +177 -0
  5. common.h +35 -0
  6. example.py +36 -0
  7. lib/stc.so +0 -0
  8. requirements.txt +1 -0
  9. sse_mathfun.h +762 -0
  10. stc.py +192 -0
  11. stc_embed_c.cpp +476 -0
  12. stc_embed_c.h +22 -0
  13. stc_extract_c.cpp +101 -0
  14. stc_extract_c.h +19 -0
  15. stc_interface.cpp +48 -0
  16. stc_interface.h +13 -0
  17. stc_ml_c.cpp +932 -0
  18. stc_ml_c.h +64 -0
Makefile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
1
+
2
# Build lib/stc.so, the shared library loaded from Python (stc.py) via ctypes.
SRC = stc_interface.cpp stc_embed_c.cpp stc_extract_c.cpp common.cpp stc_ml_c.cpp
# Derive object names from SRC instead of maintaining a duplicate list.
OBJ = $(SRC:.cpp=.o)

# 'default' and 'clean' produce no files of those names.
.PHONY: default clean

default:
	mkdir -p lib
	g++ -std=c++98 -fPIC -O3 -c $(SRC)
	g++ -shared -o lib/stc.so $(OBJ)
	rm -f *.o

clean:
	rm -f *.o *.pyc
README copy.md ADDED
@@ -0,0 +1,2 @@
 
 
1
# pySTC
A Python interface for [Syndrome Trellis Codes](http://dde.binghamton.edu/download/syndrome/) steganography.
app.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
1
import gradio as gr


def greet(name):
    """Build the demo greeting string for ``name``."""
    message = "Hello " + name + "!!"
    return message


# Minimal Gradio text-in/text-out demo around greet().
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
common.cpp ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include "common.h"

#include <boost/random/uniform_int.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/random/mersenne_twister.hpp>

/*
 * Precomputed submatrix columns for STC parity-check matrices.
 *
 * Layout: 6 groups, one per constraint height h = 7..12 (in that order).
 * Each group holds 20 rows, one per submatrix width w = 1..20; each row is
 * padded with zeros to 20 u32 entries.  getMatrix() indexes the table as
 * mats[(h - 7) * 400 + (w - 1) * 20] and copies the first w entries.
 * Each entry encodes one column as an h-bit mask (values fit in [2^(h-1), 2^h)).
 * The w = 1 row of every group is all zeros (that width is not served from
 * the table; getMatrix() falls back to random generation for it).
 */
u32 mats[] = {
// h = 7
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109, 79, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89, 127, 99, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95, 75, 121, 71, 109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71, 117, 127, 75, 89, 109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111, 83, 127, 97, 77, 117, 89, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113, 111, 87, 93, 99, 73, 117, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89, 97, 115, 81, 77, 117, 87, 127, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95, 107, 109, 79, 117, 67, 121, 123, 103, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117, 71, 109, 79, 101, 115, 123, 81, 77, 95, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119, 73, 81, 125, 123, 103, 99, 127, 109, 69, 89, 107, 0, 0, 0, 0, 0, 0, 0, 0,
87, 127, 117, 81, 97, 67, 101, 93, 105, 109, 75, 115, 123, 0, 0, 0, 0, 0, 0, 0,
93, 107, 115, 95, 121, 81, 75, 99, 111, 85, 79, 119, 105, 65, 0, 0, 0, 0, 0, 0,
123, 85, 79, 87, 127, 65, 115, 93, 101, 111, 73, 119, 105, 99, 91, 0, 0, 0, 0, 0,
127, 99, 121, 111, 71, 109, 103, 117, 113, 65, 105, 87, 101, 75, 93, 123, 0, 0, 0, 0,
89, 93, 111, 117, 103, 127, 77, 95, 85, 105, 67, 69, 113, 123, 99, 75, 119, 0, 0, 0,
65, 99, 77, 85, 101, 91, 125, 103, 127, 111, 69, 93, 75, 95, 119, 113, 105, 115, 0, 0,
91, 117, 77, 107, 101, 127, 115, 83, 85, 119, 105, 113, 93, 71, 111, 121, 97, 73, 81, 0,
95, 111, 117, 83, 97, 75, 87, 127, 85, 93, 105, 115, 77, 101, 99, 89, 71, 121, 67, 123,
// h = 8
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
247, 149, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
143, 187, 233, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
235, 141, 161, 207, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219, 185, 151, 255, 197, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
251, 159, 217, 167, 221, 133, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201, 143, 231, 251, 189, 169, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
143, 245, 177, 253, 217, 163, 155, 197, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
233, 145, 219, 185, 231, 215, 173, 129, 243, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
139, 201, 177, 167, 213, 253, 227, 199, 185, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
183, 145, 223, 199, 245, 139, 187, 157, 217, 237, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0,
223, 145, 137, 219, 197, 243, 247, 189, 135, 181, 207, 235, 0, 0, 0, 0, 0, 0, 0, 0,
229, 205, 237, 187, 135, 241, 183, 163, 151, 243, 213, 137, 159, 0, 0, 0, 0, 0, 0, 0,
205, 165, 239, 211, 231, 247, 133, 227, 219, 189, 249, 185, 149, 129, 0, 0, 0, 0, 0, 0,
131, 213, 255, 207, 227, 221, 173, 185, 197, 147, 235, 247, 217, 143, 229, 0, 0, 0, 0, 0,
247, 139, 157, 223, 187, 147, 177, 249, 165, 153, 161, 227, 237, 255, 207, 197, 0, 0, 0, 0,
205, 139, 239, 183, 147, 187, 249, 225, 253, 163, 173, 233, 209, 159, 255, 149, 197, 0, 0, 0,
177, 173, 195, 137, 211, 249, 191, 135, 175, 155, 229, 215, 203, 225, 247, 237, 221, 227, 0, 0,
159, 189, 195, 163, 255, 147, 219, 247, 231, 157, 139, 173, 185, 197, 207, 245, 193, 241, 233, 0,
235, 179, 219, 253, 241, 131, 213, 231, 247, 223, 201, 193, 191, 249, 145, 237, 155, 165, 141, 173,
// h = 9
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
339, 489, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469, 441, 379, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
371, 439, 277, 479, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
413, 489, 443, 327, 357, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
509, 453, 363, 409, 425, 303, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
377, 337, 443, 487, 467, 421, 299, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
497, 349, 279, 395, 365, 427, 399, 297, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
435, 373, 395, 507, 441, 325, 279, 289, 319, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301, 379, 509, 411, 293, 467, 455, 261, 343, 447, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
367, 289, 445, 397, 491, 279, 373, 315, 435, 473, 327, 0, 0, 0, 0, 0, 0, 0, 0, 0,
465, 379, 319, 275, 293, 407, 373, 427, 445, 497, 347, 417, 0, 0, 0, 0, 0, 0, 0, 0,
473, 401, 267, 311, 359, 347, 333, 441, 405, 381, 497, 463, 269, 0, 0, 0, 0, 0, 0, 0,
467, 283, 405, 303, 269, 337, 385, 441, 511, 361, 455, 355, 353, 311, 0, 0, 0, 0, 0, 0,
489, 311, 259, 287, 445, 471, 419, 345, 289, 391, 405, 411, 371, 457, 331, 0, 0, 0, 0, 0,
493, 427, 305, 309, 339, 447, 381, 335, 323, 423, 453, 457, 443, 313, 371, 353, 0, 0, 0, 0,
271, 301, 483, 401, 369, 367, 435, 329, 319, 473, 441, 491, 325, 455, 389, 341, 317, 0, 0, 0,
333, 311, 509, 319, 391, 441, 279, 467, 263, 487, 393, 405, 473, 303, 353, 337, 451, 365, 0, 0,
301, 477, 361, 445, 505, 363, 375, 277, 271, 353, 337, 503, 457, 357, 287, 323, 435, 345, 497, 0,
281, 361, 413, 287, 475, 359, 483, 351, 337, 425, 453, 423, 301, 309, 331, 499, 507, 277, 375, 471,
// h = 10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519, 885, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579, 943, 781, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
685, 663, 947, 805, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
959, 729, 679, 609, 843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
959, 973, 793, 747, 573, 659, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
631, 559, 1023, 805, 709, 913, 979, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
607, 867, 731, 1013, 625, 973, 825, 925, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
743, 727, 851, 961, 813, 605, 527, 563, 867, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
863, 921, 943, 523, 653, 969, 563, 597, 753, 621, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
729, 747, 901, 839, 815, 935, 777, 641, 1011, 603, 973, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581, 831, 659, 877, 781, 929, 1003, 1021, 655, 729, 983, 611, 0, 0, 0, 0, 0, 0, 0, 0,
873, 1013, 859, 887, 579, 697, 769, 927, 679, 683, 911, 753, 733, 0, 0, 0, 0, 0, 0, 0,
991, 767, 845, 977, 923, 609, 633, 769, 533, 829, 859, 759, 687, 657, 0, 0, 0, 0, 0, 0,
781, 663, 731, 829, 851, 941, 601, 997, 719, 675, 947, 939, 657, 549, 647, 0, 0, 0, 0, 0,
619, 879, 681, 601, 1015, 797, 737, 841, 839, 869, 931, 789, 767, 547, 823, 635, 0, 0, 0, 0,
855, 567, 591, 1019, 745, 945, 769, 671, 803, 799, 925, 701, 517, 653, 885, 731, 581, 0, 0, 0,
887, 643, 785, 611, 905, 669, 703, 1017, 575, 763, 625, 869, 731, 861, 847, 941, 933, 577, 0, 0,
867, 991, 1021, 709, 599, 741, 933, 921, 619, 789, 957, 791, 969, 525, 591, 763, 657, 683, 829, 0,
1009, 1003, 901, 715, 643, 803, 805, 975, 667, 619, 569, 769, 685, 767, 853, 671, 881, 907, 955, 523,
// h = 11
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1655, 1493, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1859, 1481, 1119, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1395, 1737, 1973, 1259, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1339, 1067, 1679, 1641, 2021, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1657, 1331, 1783, 2043, 1097, 1485, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1611, 1141, 1849, 2001, 1511, 1359, 1245, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1215, 1733, 1461, 2025, 1251, 1945, 1649, 1851, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1275, 1373, 1841, 1509, 1631, 1737, 1055, 1891, 1041, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1715, 1117, 1503, 2025, 1027, 1959, 1365, 1739, 1301, 1233, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1101, 1127, 1145, 1157, 1195, 1747, 1885, 1527, 1325, 2033, 1935, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1369, 1255, 1809, 1889, 1183, 1495, 1223, 1781, 2029, 1327, 1075, 1065, 0, 0, 0, 0, 0, 0, 0, 0,
1157, 1499, 1871, 1365, 1559, 1149, 1293, 1571, 1641, 1971, 1807, 1673, 2023, 0, 0, 0, 0, 0, 0, 0,
1929, 1533, 1135, 1359, 1547, 1723, 1529, 1107, 1273, 1879, 1709, 1141, 1897, 1161, 0, 0, 0, 0, 0, 0,
1861, 1801, 1675, 1699, 1103, 1665, 1657, 1287, 1459, 2047, 1181, 1835, 1085, 1377, 1511, 0, 0, 0, 0, 0,
1915, 1753, 1945, 1391, 1205, 1867, 1895, 1439, 1719, 1185, 1685, 1139, 1229, 1791, 1821, 1295, 0, 0, 0, 0,
1193, 1951, 1469, 1737, 1047, 1227, 1989, 1717, 1735, 1643, 1857, 1965, 1405, 1575, 1907, 1173, 1299, 0, 0, 0,
1641, 1887, 1129, 1357, 1543, 1279, 1687, 1975, 1839, 1775, 1109, 1337, 1081, 1435, 1603, 2037, 1249, 1153, 0, 0,
1999, 1065, 1387, 1977, 1555, 1915, 1219, 1469, 1889, 1933, 1819, 1315, 1319, 1693, 1143, 1361, 1815, 1109, 1631, 0,
1253, 1051, 1827, 1871, 1613, 1759, 2015, 1229, 1585, 1057, 1409, 1831, 1943, 1491, 1557, 1195, 1339, 1449, 1675, 1679,
// h = 12
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3475, 2685, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3865, 2883, 2519, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4019, 3383, 3029, 2397, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2725, 3703, 3391, 2235, 2669, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2489, 3151, 2695, 3353, 4029, 3867, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2467, 2137, 3047, 3881, 3125, 2683, 3631, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2739, 3163, 2137, 4031, 2967, 3413, 3749, 2301, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3443, 2305, 3365, 2231, 2127, 3697, 3535, 4041, 2621, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3641, 2777, 2789, 2357, 3003, 2729, 3229, 2925, 3443, 2291, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3567, 2361, 2061, 2219, 3905, 2285, 2871, 3187, 2455, 2783, 2685, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4043, 2615, 2385, 3911, 3267, 2871, 3667, 3037, 2905, 2921, 2129, 2299, 0, 0, 0, 0, 0, 0, 0, 0,
2315, 2997, 3743, 2729, 3117, 2297, 2585, 3141, 3283, 3943, 3613, 3345, 4047, 0, 0, 0, 0, 0, 0, 0,
3967, 3069, 3377, 3909, 3691, 2439, 2533, 3075, 2129, 3319, 3433, 3035, 2745, 2631, 0, 0, 0, 0, 0, 0,
3023, 3349, 2111, 2385, 3907, 3959, 3425, 3801, 2135, 2671, 2637, 2977, 2999, 3107, 2277, 0, 0, 0, 0, 0,
2713, 2695, 3447, 2537, 2685, 3755, 3953, 3901, 3193, 3107, 2407, 3485, 2097, 3091, 2139, 2261, 0, 0, 0, 0,
3065, 4059, 2813, 3043, 2849, 3477, 3205, 3381, 2747, 3203, 3937, 3603, 3625, 3559, 3831, 2243, 2343, 0, 0, 0,
3999, 3183, 2717, 2307, 2103, 3353, 2761, 2541, 2375, 2327, 3277, 2607, 3867, 3037, 2163, 2261, 3649, 2929, 0, 0,
2543, 2415, 3867, 3709, 3161, 2369, 4087, 2205, 3785, 2515, 2133, 2913, 3941, 3371, 2605, 3269, 3385, 3025, 2323, 0,
2939, 2775, 3663, 2413, 2573, 2205, 3821, 3513, 2699, 3379, 2479, 2663, 2367, 2517, 3027, 3201, 3177, 3281, 4069, 2069,
};
132
+
133
+ u32 *getMatrix(int width, int height) {
134
+ u32 *cols;
135
+ cols = (u32*)malloc(width * sizeof(u32));
136
+
137
+ if(width >= 2 && width <= 20 && height >= 7 && height <= 12) { // get it from the array
138
+ memcpy(cols, &mats[(height - 7) * 400 + (width - 1) * 20], width * sizeof(u32));
139
+ } else { // generate a random one
140
+ int i, j;
141
+ u32 r, mask, bop;
142
+
143
+ /* This was here because random submatrices designed with the same columns are known to be bad. But sometimes the
144
+ * payload is so small that there is no other way.
145
+ *
146
+ * Modified by Tomas Filler.
147
+ */
148
+
149
+ boost::mt19937 generator( 1 );
150
+ boost::variate_generator< boost::mt19937&, boost::uniform_int< > > rng( generator, boost::uniform_int< >( 0, RAND_MAX ) );
151
+
152
+ mask = (1 << (height - 2)) - 1;
153
+ bop = (1 << (height - 1)) + 1;
154
+ if((1 << (height - 2)) < width) {
155
+ // fprintf(stderr, "Cannot generate matrix for this payload. Choose a higher constraint height.\n");
156
+ // generate the columns randomly but let first and last row be full of 1s.
157
+ // I know, there will be identical columns.
158
+ for(i = 0; i < width; i++) {
159
+ r = ((rng() & mask) << 1) + bop;
160
+ cols[i] = r;
161
+ }
162
+ } else {
163
+ for(i = 0; i < width; i++) {
164
+ for(j = -1; j < i;) {
165
+ r = ((rng() & mask) << 1) + bop;
166
+ for(j = 0; j < i; j++) {
167
+ if(cols[j] == r)
168
+ break;
169
+ }
170
+ }
171
+ cols[i] = r;
172
+ }
173
+ }
174
+
175
+ }
176
+ return cols;
177
+ }
common.h ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef COMMON_H
#define COMMON_H

#include <string>

// Short integer aliases used throughout the STC code base.
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;

// Precomputed submatrix column table (defined in common.cpp); consumed by getMatrix().
extern u32 mats[];

/* Simple class for throwing exceptions */
// Carries a human-readable message plus a numeric error_id (see the list below).
class stc_exception : public std::exception {
public:
    stc_exception(std::string message, u32 error_id) { this->message = message; this->error_id = error_id; }
    virtual ~stc_exception() throw() {}
    virtual const char* what() const throw() { return message.c_str(); }
    u32 error_id; // numeric error code, public so callers can switch on it
private:
    std::string message;
};

/*
 The following error_ids are in use:
   1 = Submatrix height must not exceed 31.
   2 = Not enough memory.
   3 = The message cannot be longer than the cover object.
   4 = No solution exists. - This happen when there are too many Inf values in cost vector and thus the solution does not exist due to sparse parity-check matrix.
   5 = Price vector limit exceeded. - There is a limit to cost elements when you use integer version of the algorithm. Try to use costs in double.
   6 = Maximum number of trials in layered construction exceeded.
*/

// Returns a malloc()-allocated array of 'width' submatrix columns for
// constraint height 'height'; the caller is responsible for free()ing it.
u32 *getMatrix(int width, int height);

#endif
example.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Example: compute HILL embedding costs, then embed and extract a message."""

import stc
import numpy as np
import imageio
from scipy import signal

input_image = 'files/1.pgm'


def HILL(input_image):
    """Return the HILL embedding-cost map for the image at ``input_image``."""
    # High-pass residual kernel.
    high_pass = np.array([[-1, 2, -1],
                          [2, -4, 2],
                          [-1, 2, -1]])
    # Two averaging (low-pass) kernels of different support.
    avg_small = np.ones((3, 3)).astype('float32') / (3 ** 2)
    avg_large = np.ones((15, 15)).astype('float32') / (15 ** 2)

    pixels = imageio.imread(input_image)
    residual = abs(signal.convolve2d(pixels, high_pass, mode='same'))
    smoothed = signal.convolve2d(residual, avg_small, mode='same')
    inverted = 1 / smoothed
    cost_map = signal.convolve2d(inverted, avg_large, mode='same')
    # Zero residuals produce inf after the reciprocal; clamp them.
    cost_map[cost_map == np.inf] = 1
    return cost_map


costs = HILL(input_image)
print(costs)

stc.embed(input_image, costs, 'files/message.txt', 's3cr3t', 'files/stego.png')
stc.extract('files/stego.png', 's3cr3t', 'files/output.txt')

print(open('files/output.txt', 'r').read())
lib/stc.so ADDED
Binary file (75.8 kB). View file
requirements.txt ADDED
@@ -0,0 +1 @@
 
1
+ pycryptodome
sse_mathfun.h ADDED
@@ -0,0 +1,762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
2
+
3
+ Inspired by Intel Approximate Math library, and based on the
4
+ corresponding algorithms of the cephes math library
5
+
6
+ The default is to use the SSE1 version. If you define USE_SSE2 the
7
+ the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
8
+ not expect any significant performance improvement with SSE2.
9
+ */
10
+
11
+ /* Copyright (C) 2007 Julien Pommier
12
+
13
+ This software is provided 'as-is', without any express or implied
14
+ warranty. In no event will the authors be held liable for any damages
15
+ arising from the use of this software.
16
+
17
+ Permission is granted to anyone to use this software for any purpose,
18
+ including commercial applications, and to alter it and redistribute it
19
+ freely, subject to the following restrictions:
20
+
21
+ 1. The origin of this software must not be misrepresented; you must not
22
+ claim that you wrote the original software. If you use this software
23
+ in a product, an acknowledgment in the product documentation would be
24
+ appreciated but is not required.
25
+ 2. Altered source versions must be plainly marked as such, and must not be
26
+ misrepresented as being the original software.
27
+ 3. This notice may not be removed or altered from any source distribution.
28
+
29
+ (this is the zlib license)
30
+ */
31
+
32
+ #include <xmmintrin.h>
33
+
34
+ /* yes I know, the top of this file is quite ugly */
35
+
36
+ #define USE_SSE2 // use SSE2 version
37
+
38
+ #ifdef _MSC_VER /* visual c++ */
39
+ # define ALIGN16_BEG __declspec(align(16))
40
+ # define ALIGN16_END
41
+ #else /* gcc or icc */
42
+ # define ALIGN16_BEG
43
+ # define ALIGN16_END __attribute__((aligned(16)))
44
+ #endif
45
+
46
+ /* __m128 is ugly to write */
47
+ typedef __m128 v4sf; // vector of 4 float (sse1)
48
+
49
+ #ifdef USE_SSE2
50
+ # include <emmintrin.h>
51
+ typedef __m128i v4si; // vector of 4 int (sse2)
52
+ #else
53
+ typedef __m64 v2si; // vector of 2 int (mmx)
54
+ #endif
55
+
56
+ /* declare some SSE constants -- why can't I figure a better way to do that? */
57
+ #define _PS_CONST(Name, Val) \
58
+ static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
59
+ #define _PI32_CONST(Name, Val) \
60
+ static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
61
+ #define _PS_CONST_TYPE(Name, Type, Val) \
62
+ static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
63
+
64
+ _PS_CONST(1 , 1.0f);
65
+ _PS_CONST(0p5, 0.5f);
66
+ /* the smallest non denormalized float number */
67
+ _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
68
+ _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
69
+ _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
70
+
71
+ _PS_CONST_TYPE(sign_mask, int, 0x80000000);
72
+ _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
73
+
74
+ _PI32_CONST(1, 1);
75
+ _PI32_CONST(inv1, ~1);
76
+ _PI32_CONST(2, 2);
77
+ _PI32_CONST(4, 4);
78
+ _PI32_CONST(0x7f, 0x7f);
79
+
80
+ _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
81
+ _PS_CONST(cephes_log_p0, 7.0376836292E-2);
82
+ _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
83
+ _PS_CONST(cephes_log_p2, 1.1676998740E-1);
84
+ _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
85
+ _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
86
+ _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
87
+ _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
88
+ _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
89
+ _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
90
+ _PS_CONST(cephes_log_q1, -2.12194440e-4);
91
+ _PS_CONST(cephes_log_q2, 0.693359375);
92
+
93
#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* does occur only at -O0 optimization level
*/

// Drop-in replacement for _mm_movehl_ps built on inline asm so the broken
// intrinsic is bypassed entirely.
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a; }
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

// Inline-asm replacements for the _mm_cmp* intrinsics: each emits the single
// SSE compare instruction directly ("cmpltps" = less-than, "cmpnleps" =
// not-less-or-equal i.e. greater-than, "cmpeqps" = equal).
inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpnleps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
141
+
142
+ #ifndef USE_SSE2
143
+ typedef union xmm_mm_union {
144
+ __m128 xmm;
145
+ __m64 mm[2];
146
+ } xmm_mm_union;
147
+
148
+ #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
149
+ xmm_mm_union u; u.xmm = xmm_; \
150
+ mm0_ = u.mm[0]; \
151
+ mm1_ = u.mm[1]; \
152
+ }
153
+
154
+ #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
155
+ xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
156
+ }
157
+
158
+ #endif // USE_SSE2
159
+
160
/* natural logarithm computed for 4 simultaneous float
   return NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  // Remember which lanes are invalid (x <= 0); they are forced to NaN at the end.
  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);   // shift out the mantissa, keep the biased exponent
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);  // set exponent to -1, i.e. mantissa in [0.5, 1)

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);  // remove the IEEE-754 exponent bias (127)
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  v4sf z = _mm_mul_ps(x,x);

  // Evaluate the cephes polynomial (Horner's scheme over the log_p* coefficients).
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  // Recombine: result = polynomial + e*log(2), with log(2) split into q1+q2
  // (two constants) for extra precision.
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
+
251
+ _PS_CONST(exp_hi, 88.3762626647949f);
252
+ _PS_CONST(exp_lo, -88.3762626647949f);
253
+
254
+ _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
255
+ _PS_CONST(cephes_exp_C1, 0.693359375);
256
+ _PS_CONST(cephes_exp_C2, -2.12194440e-4);
257
+
258
+ _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
259
+ _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
260
+ _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
261
+ _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
262
+ _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
263
+ _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
264
+
265
/* exponential computed for 4 simultaneous floats; the input is first clamped
   to [exp_lo, exp_hi] to avoid overflow/underflow in the 2^n reconstruction */
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, substract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  // Reduce the argument: x -= fx*log(2), with log(2) split into C1+C2 for precision.
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  // Evaluate the cephes polynomial (Horner's scheme over the exp_p* coefficients).
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);  // re-bias the exponent (add 127)
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);                 // move it into the float exponent field
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
+
343
+ _PS_CONST(minus_cephes_DP1, -0.78515625);
344
+ _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
345
+ _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
346
+ _PS_CONST(sincof_p0, -1.9515295891E-4);
347
+ _PS_CONST(sincof_p1, 8.3321608736E-3);
348
+ _PS_CONST(sincof_p2, -1.6666654611E-1);
349
+ _PS_CONST(coscof_p0, 2.443315711809948E-005);
350
+ _PS_CONST(coscof_p1, -1.388731625493765E-003);
351
+ _PS_CONST(coscof_p2, 4.166664568298827E-002);
352
+ _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
353
+
354
+
355
+ /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
356
+ it runs also on old athlons XPs and the pentium III of your grand
357
+ mother.
358
+
359
+ The code is the exact rewriting of the cephes sinf function.
360
+ Precision is excellent as long as x < 8192 (I did not bother to
361
+ take into account the special handling they have for greater values
362
+ -- it does not return garbage for arguments over 8192, though, but
363
+ the extra precision is missing).
364
+
365
+ Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
366
+ surprising but correct result.
367
+
368
+ Performance is also surprisingly good, 1.33 times faster than the
369
+ macos vsinf SSE2 function, and 1.5 times faster than the
370
+ __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
371
+ too bad for an SSE1 function (with no special tuning) !
372
+ However the latter libraries probably have a much better handling of NaN,
373
+ Inf, denormalized and other special arguments..
374
+
375
+ On my core 1 duo, the execution of this function takes approximately 95 cycles.
376
+
377
+ From what I have observed on the experiments with Intel AMath lib, switching to an
378
+ SSE2 version would improve the perf by only 10%.
379
+
380
+ Since it is based on SSE intrinsics, it has to be compiled at -O2 to
381
+ deliver full speed.
382
+ */
383
/* Compute sin(x) for each of the four packed floats in x.
   Exact rewrite of the cephes sinf: range-reduce with j = round_even(x*4/Pi),
   use the octant to pick sign flips and one of two minimax polynomials.
   Precision is excellent for |x| < 8192 (see file header comment). */
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* SSE1 fallback: same computation split across two MMX registers. */
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
498
+
499
/* almost the same as sin_ps */
/* Compute cos(x) for each of the four packed floats in x.
   Identical structure to sin_ps; the only differences are that the input
   sign is irrelevant (cos is even) and the octant index is shifted by 2
   (emm2 - 2) before deriving the sign/polynomial selection masks. */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* SSE1 fallback: same computation split across two MMX registers. */
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
616
+
617
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
/* Compute both sin(x) into *s and cos(x) into *c for the four packed floats
   in x, sharing the range reduction and both polynomial evaluations. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* keep a copy of the octant index for the cosine sign below */
  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* SSE1 fallback: same computation split across two MMX registers. */
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* sign of the cosine comes from the saved octant index shifted by 2 */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms:
     the sine takes one branch per lane, the cosine takes the other */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
762
+
stc.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import os.path
5
+ import math
6
+ import random
7
+ import struct
8
+ import hashlib
9
+ from PIL import Image
10
+ from ctypes import *
11
+ from Crypto.Cipher import AES
12
+ from Crypto.Random import get_random_bytes
13
+ from Crypto.Util.Padding import pad, unpad
14
+
15
def prepare_message(filename, password):
    """Read a text file, prefix it with a (version, length) header, encrypt
    it with `password`, and return the result as a list of bits.

    Bits are emitted least-significant-bit first within each byte, matching
    the unpacking loop in extract().

    Args:
        filename: path of the plaintext message file (read as UTF-8 text).
        password: passphrase forwarded to encrypt().

    Returns:
        list[int]: the encrypted payload as 0/1 bit values.
    """
    # Use a context manager so the handle is closed even if read() raises
    # (the original code leaked the open file object).
    with open(filename, 'r') as f:
        content_data = f.read().encode('utf-8')

    # Header: 1-byte format version + 4-byte big-endian payload length.
    content_ver = struct.pack("B", 1)  # version 1
    content_len = struct.pack("!I", len(content_data))
    content = content_ver + content_len + content_data

    # encrypt -> salt || iv || ciphertext
    enc = encrypt(content, password)

    # Serialize to bits, LSB first.
    array = []
    for b in enc:
        for i in range(8):
            array.append((b >> i) & 1)
    return array
33
+
34
+
35
# {{{ encrypt()
def encrypt(plain_text, password):
    """AES-256-CBC encrypt `plain_text`; returns salt || iv || ciphertext.

    The key is derived from `password` with scrypt (n=2**14, r=8, p=1) using
    a random per-message salt; the salt and IV are prepended so decrypt()
    can recover them.
    """
    salt = get_random_bytes(AES.block_size)

    # Derive a 32-byte AES key from the password with the scrypt KDF.
    key = hashlib.scrypt(
        password.encode(), salt=salt, n=2 ** 14, r=8, p=1, dklen=32)

    aes = AES.new(key, AES.MODE_CBC)
    body = aes.encrypt(pad(plain_text, AES.block_size))

    return salt + aes.iv + body
# }}}
50
+
51
# {{{ decrypt()
def decrypt(cipher_text, password):
    """Decrypt a salt || iv || ciphertext blob produced by encrypt().

    PKCS#7 padding is intentionally NOT removed here: the caller parses the
    embedded length header instead, because the extracted bit stream may
    carry trailing garbage bits.
    """
    bs = AES.block_size
    salt = cipher_text[:bs]
    iv = cipher_text[bs:2 * bs]
    body = cipher_text[2 * bs:]

    # Drop any trailing partial block left over from bit extraction.
    body = body[:len(body) - (len(body) % bs)]

    key = hashlib.scrypt(
        password.encode(), salt=salt, n=2 ** 14, r=8, p=1, dklen=32)

    aes = AES.new(key, AES.MODE_CBC, iv=iv)
    decrypted = aes.decrypt(body)
    #decrypted = unpad(decrypted, AES.block_size)

    return decrypted
# }}}
71
+
72
+
73
+
74
+
75
def embed(input_img_path, cost_matrix, msg_file_path, password, output_img_path, payload=0.40):
    """Hide an encrypted message file inside an image using syndrome trellis codes.

    Args:
        input_img_path: path of the cover image (grayscale mode 'L' is the
            supported case; see note below).
        cost_matrix: 2-D indexable of per-pixel embedding costs, read as
            cost_matrix[i, j] with i = column, j = row.
        msg_file_path: path of the plaintext message file.
        password: passphrase used to derive the AES key.
        output_img_path: where the stego image is written.
        payload: relative payload in message bits per pixel (default 0.40).

    Exits the process if the message does not fit in the requested payload.
    """
    me = os.path.abspath(os.path.dirname(__file__))
    lib = cdll.LoadLibrary(os.path.join(me, "lib", "stc.so"))

    # Prepare cover image.
    im = Image.open(input_img_path)
    # BUG FIX: width/height were previously assigned only for mode 'L',
    # leaving them undefined (NameError) for RGB/RGBA covers. The image size
    # is mode-independent, so read it unconditionally.
    width, height = im.size
    if im.mode not in ['L']:
        # Color images are not handled per-channel by the code below;
        # grayscale 'L' is the intended input. Kept non-fatal for
        # backward compatibility.
        pass
    I = im.load()
    cover = (c_int * (width * height))()
    idx = 0
    for j in range(height):
        for i in range(width):
            cover[idx] = I[i, j]
            idx += 1

    # Prepare costs: 3 entries per pixel (cost of -1, of keeping, of +1).
    INF = 2 ** 31 - 1  # "wet" cost forbids changes that would leave [0, 255]
    costs = (c_float * (width * height * 3))()
    idx = 0
    for j in range(height):
        for i in range(width):
            if cover[idx] == 0:
                costs[3 * idx + 0] = INF
                costs[3 * idx + 1] = 0
                costs[3 * idx + 2] = cost_matrix[i, j]
            elif cover[idx] == 255:
                costs[3 * idx + 0] = cost_matrix[i, j]
                costs[3 * idx + 1] = 0
                costs[3 * idx + 2] = INF
            else:
                costs[3 * idx + 0] = cost_matrix[i, j]
                costs[3 * idx + 1] = 0
                costs[3 * idx + 2] = cost_matrix[i, j]
            idx += 1

    # Prepare message bits, zero-padded up to the full payload length.
    msg_bits = prepare_message(msg_file_path, password)
    if len(msg_bits) > width * height * payload:
        print("Message too long")
        sys.exit(0)
    m = int(width * height * payload)
    message = (c_ubyte * m)()
    for i in range(m):
        message[i] = msg_bits[i] if i < len(msg_bits) else 0

    # Hide message (stego receives the modified pixel values).
    stego = (c_int * (width * height))()
    lib.stc_hide(width * height, cover, costs, m, message, stego)

    # Save output image.
    idx = 0
    for j in range(height):
        for i in range(width):
            im.putpixel((i, j), stego[idx])
            idx += 1
    im.save(output_img_path)
    im.close()
139
+
140
+
141
+
142
def extract(stego_img_path, password, output_msg_path, payload=0.40):
    """Recover a message hidden by embed() and write it to a text file.

    Args:
        stego_img_path: path of the stego image (grayscale mode 'L' expected).
        password: passphrase used by embed(); must match.
        output_msg_path: path where the recovered plaintext is written.
        payload: relative payload used at embedding time; must match embed().
    """
    me = os.path.abspath(os.path.dirname(__file__))
    lib = cdll.LoadLibrary(os.path.join(me, "lib", "stc.so"))

    # Prepare stego image.
    im = Image.open(stego_img_path)
    # BUG FIX: width/height were previously assigned only for mode 'L',
    # leaving them undefined (NameError) for RGB/RGBA images. The image size
    # is mode-independent, so read it unconditionally.
    width, height = im.size
    if im.mode not in ['L']:
        # Grayscale 'L' is the intended input; kept non-fatal for
        # backward compatibility.
        pass
    I = im.load()
    stego = (c_int * (width * height))()
    idx = 0
    for j in range(height):
        for i in range(width):
            stego[idx] = I[i, j]
            idx += 1

    # Extract the raw message bits.
    n = width * height
    m = int(n * payload)
    extracted_message = (c_ubyte * m)()
    lib.stc_unhide(n, stego, m, extracted_message)

    # Repack bits into bytes, LSB first (mirrors prepare_message()).
    # Any trailing partial byte is dropped; decrypt() trims partial blocks.
    enc = bytearray()
    bitidx = 0
    bitval = 0
    for b in extracted_message:
        if bitidx == 8:
            enc.append(bitval)
            bitidx = 0
            bitval = 0
        bitval |= b << bitidx
        bitidx += 1
    if bitidx == 8:
        enc.append(bitval)

    # decrypt
    cleartext = decrypt(enc, password)

    # Parse the header (1-byte version + 4-byte big-endian length), then
    # slice out exactly the original payload.
    content_ver = struct.unpack_from("B", cleartext, 0)
    content_len = struct.unpack_from("!I", cleartext, 1)
    content = cleartext[5:content_len[0] + 5]

    with open(output_msg_path, 'w') as f:
        f.write(content.decode())
stc_embed_c.cpp ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cstdlib>
2
+ #include <cstring>
3
+ #include <cmath>
4
+ #include <cfloat>
5
+ #include <limits>
6
+ #include <emmintrin.h>
7
+ #include <cstdio>
8
+ #include <sstream>
9
+ #include "stc_embed_c.h"
10
+
11
// {{{ aligned_malloc()
/* Allocate `bytes` bytes whose address is a multiple of `align` (a power of
   two). The offset applied to the raw malloc() pointer is stored in the byte
   just before the returned address, so aligned_free() can recover it.
   Returns NULL when the underlying malloc() fails. */
void *aligned_malloc( unsigned int bytes, int align ) {
    char *raw = (char *) malloc( bytes + align );

    if ( raw == NULL ) return raw;
    int offset = align - (int) (((unsigned long long) raw) & (align - 1));
    char *aligned = raw + offset;
    aligned[-1] = offset; // remember how far we shifted from the raw pointer
    return (void *) aligned;
}
// }}}
23
+
24
// {{{ aligned_free()
/* Release a pointer obtained from aligned_malloc(): the byte stored just
   before the aligned address records how far it was shifted from the raw
   malloc() pointer. */
void aligned_free( void *vptr ) {
    char *aligned = (char *) vptr;
    free( aligned - aligned[-1] );
}
// }}}
31
+
32
// {{{ maxLessThan255()
// Byte-wise unsigned max of v1 and v2 in which bytes equal to 0xff are
// first zeroed out: 255 is the saturated "infinity" sentinel of the u8
// price arrays, so it must never win the maximum.
inline __m128i maxLessThan255( const __m128i v1, const __m128i v2 ) {
    register __m128i mask = _mm_set1_epi32( 0xffffffff );
    return _mm_max_epu8( _mm_andnot_si128( _mm_cmpeq_epi8( v1, mask ), v1 ), _mm_andnot_si128( _mm_cmpeq_epi8( v2, mask ), v2 ) );
}
// }}}
38
+
39
// {{{ max16B()
// Horizontal maximum of the 16 unsigned bytes packed in maxp.
inline u8 max16B( __m128i maxp ) {
    u8 mtemp[4];
    // Fold the vector onto itself twice: 16 -> 8 -> 4 candidate bytes.
    maxp = _mm_max_epu8( maxp, _mm_srli_si128(maxp, 8) );
    maxp = _mm_max_epu8( maxp, _mm_srli_si128(maxp, 4) );
    // Extract the remaining 4 bytes and finish the reduction in scalar code.
    *((int*) mtemp) = _mm_cvtsi128_si32( maxp );
    if ( mtemp[2] > mtemp[0] ) mtemp[0] = mtemp[2];
    if ( mtemp[3] > mtemp[1] ) mtemp[1] = mtemp[3];
    if ( mtemp[1] > mtemp[0] ) return mtemp[1];
    else return mtemp[0];
}
// }}}
51
+
52
// {{{ min16B()
// Horizontal minimum of the 16 unsigned bytes packed in minp
// (mirror image of max16B).
inline u8 min16B( __m128i minp ) {
    u8 mtemp[4];
    // Fold the vector onto itself twice: 16 -> 8 -> 4 candidate bytes.
    minp = _mm_min_epu8( minp, _mm_srli_si128(minp, 8) );
    minp = _mm_min_epu8( minp, _mm_srli_si128(minp, 4) );
    // Extract the remaining 4 bytes and finish the reduction in scalar code.
    *((int*) mtemp) = _mm_cvtsi128_si32( minp );
    if ( mtemp[2] < mtemp[0] ) mtemp[0] = mtemp[2];
    if ( mtemp[3] < mtemp[1] ) mtemp[1] = mtemp[3];
    if ( mtemp[1] < mtemp[0] ) return mtemp[1];
    else return mtemp[0];
}
// }}}
64
+
65
// {{{ stc_embed()
/* Syndrome-trellis embedding (Viterbi-style forward pass + optional
   backtracking). Finds the cheapest binary `stego` vector whose syndrome
   w.r.t. the convolutional parity-check matrix (submatrix height
   `matrixheight`, columns chosen by getMatrix) equals `syndrome`.

   The forward pass runs in one of two SSE variants:
     - usefloat == true:  prices are doubles, accumulated as packed floats;
     - usefloat == false: prices are u8, accumulated with saturating byte
       adds and periodically renormalized by subtracting the running minimum
       (0xff bytes act as the "infinity" sentinel).
   When `stego` != NULL, per-state decision bits are recorded in `path` and
   replayed backwards at the end to reconstruct the optimal stego vector.
   Throws stc_exception on invalid arguments, allocation failure, or when no
   solution exists; returns the total embedding distortion. */
double stc_embed( const u8 *vector, int vectorlength, const u8 *syndrome, int syndromelength, const void *pricevectorv, bool usefloat,
        u8 *stego, int matrixheight ) {
    int height, i, k, l, index, index2, parts, m, sseheight, altm, pathindex;
    u32 column, colmask, state;
    double totalprice;

    u8 *ssedone;
    u32 *path, *columns[2];
    int *matrices, *widths;

    if ( matrixheight > 31 ) throw stc_exception( "Submatrix height must not exceed 31.", 1 );

    // Number of trellis states, rounded up to a multiple of 32 so the state
    // array splits evenly into 32-bit path words ("parts").
    height = 1 << matrixheight;
    colmask = height - 1;
    height = (height + 31) & (~31);

    parts = height >> 5;

    if ( stego != NULL ) {
        path = (u32*) malloc( vectorlength * parts * sizeof(u32) );
        if ( path == NULL ) {
            std::stringstream ss;
            ss << "Not enough memory (" << (unsigned int) (vectorlength * parts * sizeof(u32)) << " byte array could not be allocated).";
            throw stc_exception( ss.str(), 2 );
        }
        pathindex = 0;
    }

    {
        // Distribute the cover over the syndrome bits: each syndrome bit gets
        // either `shorter` or `longer` cover elements so the totals track
        // vectorlength/syndromelength as closely as possible.
        int shorter, longer, worm;
        double invalpha;

        matrices = (int *) malloc( syndromelength * sizeof(int) );
        widths = (int *) malloc( syndromelength * sizeof(int) );

        invalpha = (double) vectorlength / syndromelength;
        if ( invalpha < 1 ) {
            free( matrices );
            free( widths );
            if ( stego != NULL ) free( path );
            throw stc_exception( "The message cannot be longer than the cover object.", 3 );
        }
        /* THIS IS OBSOLETE. Algorithm still works for alpha >1/2. You need to take care of cases with too many Infs in cost vector.
        if(invalpha < 2) {
            printf("The relative payload is greater than 1/2. This may result in poor embedding efficiency.\n");
        }
        */
        shorter = (int) floor( invalpha );
        longer = (int) ceil( invalpha );
        if ( (columns[0] = getMatrix( shorter, matrixheight )) == NULL ) {
            free( matrices );
            free( widths );
            if ( stego != NULL ) free( path );
            return -1;
        }
        if ( (columns[1] = getMatrix( longer, matrixheight )) == NULL ) {
            free( columns[0] );
            free( matrices );
            free( widths );
            if ( stego != NULL ) free( path );
            return -1;
        }
        worm = 0;
        for ( i = 0; i < syndromelength; i++ ) {
            if ( worm + longer <= (i + 1) * invalpha + 0.5 ) {
                matrices[i] = 1;
                widths[i] = longer;
                worm += longer;
            } else {
                matrices[i] = 0;
                widths[i] = shorter;
                worm += shorter;
            }
        }
    }

    if ( usefloat ) {
        /*
        SSE FLOAT VERSION
        */
        int pathindex8 = 0;
        int shift[2] = { 0, 4 };
        u8 mask[2] = { 0xf0, 0x0f };
        float *prices;
        u8 *path8 = (u8*) path;
        double *pricevector = (double*) pricevectorv;
        double total = 0;
        float inf = std::numeric_limits< float >::infinity();

        // 4 float prices per SSE register.
        sseheight = height >> 2;
        ssedone = (u8*) malloc( sseheight * sizeof(u8) );
        prices = (float*) aligned_malloc( height * sizeof(float), 16 );

        {
            // All states start unreachable (infinite price) except state 0.
            __m128 fillval = _mm_set1_ps( inf );
            for ( i = 0; i < height; i += 4 ) {
                _mm_store_ps( &prices[i], fillval );
                ssedone[i >> 2] = 0;
            }
        }

        prices[0] = 0.0f;

        for ( index = 0, index2 = 0; index2 < syndromelength; index2++ ) {
            register __m128 c1, c2;

            for ( k = 0; k < widths[index2]; k++, index++ ) {
                column = columns[matrices[index2]][k] & colmask;

                // c1 = price of emitting bit equal to the cover bit,
                // c2 = price of flipping it (or vice versa, per cover bit).
                if ( vector[index] == 0 ) {
                    c1 = _mm_setzero_ps();
                    c2 = _mm_set1_ps( (float) pricevector[index] );
                } else {
                    c1 = _mm_set1_ps( (float) pricevector[index] );
                    c2 = _mm_setzero_ps();
                }

                total += pricevector[index];

                // Relax each pair of states (m, m XOR column); ssedone marks
                // registers already updated through their partner.
                for ( m = 0; m < sseheight; m++ ) {
                    if ( !ssedone[m] ) {
                        register __m128 v1, v2, v3, v4;
                        altm = (m ^ (column >> 2));
                        v1 = _mm_load_ps( &prices[m << 2] );
                        v2 = _mm_load_ps( &prices[altm << 2] );
                        v3 = v1;
                        v4 = v2;
                        ssedone[m] = 1;
                        ssedone[altm] = 1;
                        // Low 2 bits of the column permute lanes inside the
                        // 4-float register.
                        switch ( column & 3 ) {
                        case 0:
                            break;
                        case 1:
                            v2 = _mm_shuffle_ps(v2, v2, 0xb1);
                            v3 = _mm_shuffle_ps(v3, v3, 0xb1);
                            break;
                        case 2:
                            v2 = _mm_shuffle_ps(v2, v2, 0x4e);
                            v3 = _mm_shuffle_ps(v3, v3, 0x4e);
                            break;
                        case 3:
                            v2 = _mm_shuffle_ps(v2, v2, 0x1b);
                            v3 = _mm_shuffle_ps(v3, v3, 0x1b);
                            break;
                        }
                        v1 = _mm_add_ps( v1, c1 );
                        v2 = _mm_add_ps( v2, c2 );
                        v3 = _mm_add_ps( v3, c2 );
                        v4 = _mm_add_ps( v4, c1 );

                        v1 = _mm_min_ps( v1, v2 );
                        v4 = _mm_min_ps( v3, v4 );

                        _mm_store_ps( &prices[m << 2], v1 );
                        _mm_store_ps( &prices[altm << 2], v4 );

                        if ( stego != NULL ) {
                            // Record which branch won: one bit per state,
                            // packed 4 bits per path byte.
                            v2 = _mm_cmpeq_ps( v1, v2 );
                            v3 = _mm_cmpeq_ps( v3, v4 );
                            path8[pathindex8 + (m >> 1)] = (path8[pathindex8 + (m >> 1)] & mask[m & 1]) | (_mm_movemask_ps( v2 ) << shift[m
                                    & 1]);
                            path8[pathindex8 + (altm >> 1)] = (path8[pathindex8 + (altm >> 1)] & mask[altm & 1]) | (_mm_movemask_ps( v3 )
                                    << shift[altm & 1]);
                        }
                    }
                }

                for ( i = 0; i < sseheight; i++ ) {
                    ssedone[i] = 0;
                }

                pathindex += parts;
                pathindex8 += parts << 2;
            }

            // Prune to the half of the states consistent with this syndrome
            // bit (keep even or odd lanes), compacting the price array.
            if ( syndrome[index2] == 0 ) {
                for ( i = 0, l = 0; i < sseheight; i += 2, l += 4 ) {
                    _mm_store_ps( &prices[l], _mm_shuffle_ps(_mm_load_ps(&prices[i << 2]), _mm_load_ps(&prices[(i + 1) << 2]), 0x88) );
                }
            } else {
                for ( i = 0, l = 0; i < sseheight; i += 2, l += 4 ) {
                    _mm_store_ps( &prices[l], _mm_shuffle_ps(_mm_load_ps(&prices[i << 2]), _mm_load_ps(&prices[(i + 1) << 2]), 0xdd) );
                }
            }

            // Near the end of the message the trellis narrows.
            if ( syndromelength - index2 <= matrixheight ) colmask >>= 1;

            {
                // Invalidate the now-unused upper half of the price array.
                register __m128 fillval = _mm_set1_ps( inf );
                for ( l >>= 2; l < sseheight; l++ ) {
                    _mm_store_ps( &prices[l << 2], fillval );
                }
            }
        }

        totalprice = prices[0];

        aligned_free( prices );
        free( ssedone );

        if ( totalprice >= total ) {
            free( matrices );
            free( widths );
            free( columns[0] );
            free( columns[1] );
            if ( stego != NULL ) free( path );
            throw stc_exception( "No solution exist.", 4 );
        }
    } else {
        /*
        SSE UINT8 VERSION
        */
        int pathindex16 = 0, subprice = 0;
        u8 maxc = 0, minc = 0;
        u8 *prices, *pricevector = (u8*) pricevectorv;
        u16 *path16 = (u16 *) path;
        __m128i *prices16B;

        // 16 byte-prices per SSE register.
        sseheight = height >> 4;
        ssedone = (u8*) malloc( sseheight * sizeof(u8) );
        prices = (u8*) aligned_malloc( height * sizeof(u8), 16 );
        prices16B = (__m128i *) prices;

        {
            // 0xff is the "infinity" sentinel; only state 0 starts reachable.
            __m128i napln = _mm_set1_epi32( 0xffffffff );
            for ( i = 0; i < sseheight; i++ ) {
                _mm_store_si128( &prices16B[i], napln );
                ssedone[i] = 0;
            }
        }

        prices[0] = 0;

        for ( index = 0, index2 = 0; index2 < syndromelength; index2++ ) {
            register __m128i c1, c2, maxp, minp;

            // Byte prices must never saturate into the 0xff sentinel.
            if ( (u32) maxc + pricevector[index] >= 254 ) {
                // NOTE(review): `path` was allocated with plain malloc() and
                // is freed again below via free(path) — this aligned_free()
                // looks like a mismatched/double free; confirm before reuse.
                aligned_free( path );
                free( ssedone );
                free( matrices );
                free( widths );
                free( columns[0] );
                free( columns[1] );
                if ( stego != NULL ) free( path );
                throw stc_exception( "Price vector limit exceeded.", 5 );
            }

            for ( k = 0; k < widths[index2]; k++, index++ ) {
                column = columns[matrices[index2]][k] & colmask;

                if ( vector[index] == 0 ) {
                    c1 = _mm_setzero_si128();
                    c2 = _mm_set1_epi8( pricevector[index] );
                } else {
                    c1 = _mm_set1_epi8( pricevector[index] );
                    c2 = _mm_setzero_si128();
                }

                minp = _mm_set1_epi8( -1 );
                maxp = _mm_setzero_si128();

                for ( m = 0; m < sseheight; m++ ) {
                    if ( !ssedone[m] ) {
                        register __m128i v1, v2, v3, v4;
                        altm = (m ^ (column >> 4));
                        v1 = _mm_load_si128( &prices16B[m] );
                        v2 = _mm_load_si128( &prices16B[altm] );
                        v3 = v1;
                        v4 = v2;
                        ssedone[m] = 1;
                        ssedone[altm] = 1;
                        // Low 4 bits of the column permute the 16 bytes
                        // inside the register, one bit per swap level.
                        if ( column & 8 ) {
                            v2 = _mm_shuffle_epi32(v2, 0x4e);
                            v3 = _mm_shuffle_epi32(v3, 0x4e);
                        }
                        if ( column & 4 ) {
                            v2 = _mm_shuffle_epi32(v2, 0xb1);
                            v3 = _mm_shuffle_epi32(v3, 0xb1);
                        }
                        if ( column & 2 ) {
                            v2 = _mm_shufflehi_epi16(v2, 0xb1);
                            v3 = _mm_shufflehi_epi16(v3, 0xb1);
                            v2 = _mm_shufflelo_epi16(v2, 0xb1);
                            v3 = _mm_shufflelo_epi16(v3, 0xb1);
                        }
                        if ( column & 1 ) {
                            v2 = _mm_or_si128( _mm_srli_epi16( v2, 8 ), _mm_slli_epi16( v2, 8 ) );
                            v3 = _mm_or_si128( _mm_srli_epi16( v3, 8 ), _mm_slli_epi16( v3, 8 ) );
                        }
                        // Saturating adds keep unreachable states at 0xff.
                        v1 = _mm_adds_epu8( v1, c1 );
                        v2 = _mm_adds_epu8( v2, c2 );
                        v3 = _mm_adds_epu8( v3, c2 );
                        v4 = _mm_adds_epu8( v4, c1 );

                        v1 = _mm_min_epu8( v1, v2 );
                        v4 = _mm_min_epu8( v3, v4 );

                        _mm_store_si128( &prices16B[m], v1 );
                        _mm_store_si128( &prices16B[altm], v4 );

                        minp = _mm_min_epu8( minp, _mm_min_epu8( v1, v4 ) );
                        maxp = _mm_max_epu8( maxp, maxLessThan255( v1, v4 ) );

                        if ( stego != NULL ) {
                            v2 = _mm_cmpeq_epi8( v1, v2 );
                            v3 = _mm_cmpeq_epi8( v3, v4 );
                            path16[pathindex16 + m] = (u16) _mm_movemask_epi8( v2 );
                            path16[pathindex16 + altm] = (u16) _mm_movemask_epi8( v3 );
                        }
                    }
                }

                maxc = max16B( maxp );
                minc = min16B( minp );

                // Renormalize: subtract the global minimum from every finite
                // price so the u8 range is not exhausted; account for it in
                // subprice so totalprice stays exact.
                maxc -= minc;
                subprice += minc;
                {
                    register __m128i mask = _mm_set1_epi32( 0xffffffff );
                    register __m128i m = _mm_set1_epi8( minc );
                    for ( i = 0; i < sseheight; i++ ) {
                        register __m128i res;
                        register __m128i pr = prices16B[i];
                        res = _mm_andnot_si128( _mm_cmpeq_epi8( pr, mask ), m );
                        prices16B[i] = _mm_sub_epi8( pr, res );
                        ssedone[i] = 0;
                    }
                }

                pathindex += parts;
                pathindex16 += parts << 1;
            }

            {
                register __m128i mask = _mm_set1_epi32( 0x00ff00ff );

                if ( minc == 255 ) {
                    // NOTE(review): same suspicious aligned_free(path) +
                    // free(path) pair as above; confirm.
                    aligned_free( path );
                    free( ssedone );
                    free( matrices );
                    free( widths );
                    free( columns[0] );
                    free( columns[1] );
                    if ( stego != NULL ) free( path );
                    throw stc_exception( "The syndrome is not in the syndrome matrix range.", 4 );
                }

                // Prune to even or odd states per the syndrome bit,
                // compacting the byte prices.
                if ( syndrome[index2] == 0 ) {
                    for ( i = 0, l = 0; i < sseheight; i += 2, l++ ) {
                        _mm_store_si128( &prices16B[l], _mm_packus_epi16( _mm_and_si128( _mm_load_si128( &prices16B[i] ), mask ),
                                _mm_and_si128( _mm_load_si128( &prices16B[i + 1] ), mask ) ) );
                    }
                } else {
                    for ( i = 0, l = 0; i < sseheight; i += 2, l++ ) {
                        _mm_store_si128( &prices16B[l], _mm_packus_epi16( _mm_and_si128( _mm_srli_si128(_mm_load_si128(&prices16B[i]), 1),
                                mask ), _mm_and_si128( _mm_srli_si128(_mm_load_si128(&prices16B[i + 1]), 1), mask ) ) );
                    }
                }

                if ( syndromelength - index2 <= matrixheight ) colmask >>= 1;

                // Invalidate the now-unused upper half of the price array.
                register __m128i fillval = _mm_set1_epi32( 0xffffffff );
                for ( ; l < sseheight; l++ )
                    _mm_store_si128( &prices16B[l], fillval );
            }
        }

        totalprice = subprice + prices[0];

        aligned_free( prices );
        free( ssedone );
    }

    if ( stego != NULL ) {
        // Backtrack through the recorded decision bits, reconstructing the
        // state sequence (and hence the stego bits) from last to first.
        pathindex -= parts;
        index--;
        index2--;
        state = 0;

        // unused
        // int h = syndromelength;
        state = 0;
        colmask = 0;
        for ( ; index2 >= 0; index2-- ) {
            for ( k = widths[index2] - 1; k >= 0; k--, index-- ) {
                if ( k == widths[index2] - 1 ) {
                    // Entering a new syndrome bit: shift it into the state
                    // and widen the column mask back up.
                    state = (state << 1) | syndrome[index2];
                    if ( syndromelength - index2 <= matrixheight ) colmask = (colmask << 1) | 1;
                }

                if ( path[pathindex + (state >> 5)] & (1 << (state & 31)) ) {
                    stego[index] = 1;
                    state = state ^ (columns[matrices[index2]][k] & colmask);
                } else {
                    stego[index] = 0;
                }

                pathindex -= parts;
            }
        }
        free( path );
    }

    free( matrices );
    free( widths );
    free( columns[0] );
    free( columns[1] );

    return totalprice;
}
// }}}
stc_embed_c.h ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef STC_EMBED_C_H
2
+ #define STC_EMBED_C_H
3
+
4
+ #include "common.h"
5
+ /* Inputs:
6
+ cover - the binary cover vector
7
+ coverlength - length of the cover vector
8
+ message - the binary message to be hidden
9
+ messagelength - length of the message
10
+ profile - the vector of distortion weights (either double if usedouble = true, or u8 if usedouble = false)
11
+ usedouble - true = use double precision weight, false = use u8 weights
12
+ stego - pointer to an array of length 'coverlength' to receive the stego message; this parameter can be NULL
13
+ constr_height - the constraint height of the matrix; the higher, the better the efficiency but the greater the embedding time
14
+
15
+ Return value:
16
+ On success, the function returns the total distortion introduced by the embedding.
17
+ On error, the function returns -1.
18
+ */
19
+
20
+ double stc_embed(const u8 *cover, int coverlength, const u8 *message, int messagelength, const void *profile, bool usedouble, u8 *stego, int constr_height = 10);
21
+
22
+ #endif
stc_extract_c.cpp ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cstdlib>
2
+ #include <cstring>
3
+ #include <cmath>
4
+ #include <cstdio>
5
+ #include "stc_extract_c.h"
6
+
7
+ // {{{ stc_extract()
8
+ int stc_extract(const u8 *vector, int vectorlength, u8 *message, int syndromelength, int matrixheight)
9
+ {
10
+ int i, j, k, index, index2, base, height;
11
+
12
+ u8 *binmat[2];
13
+ int *matrices, *widths;
14
+
15
+ height = matrixheight;
16
+
17
+ if(matrixheight > 31) {
18
+ fprintf(stderr, "Submatrix height must not exceed 31.");
19
+ return -1;
20
+ }
21
+
22
+ {
23
+ double invalpha;
24
+ int shorter, longer, worm;
25
+ u32 *columns[2];
26
+
27
+ matrices = (int *)malloc(syndromelength * sizeof(int));
28
+ widths = (int *)malloc(syndromelength * sizeof(int));
29
+
30
+ invalpha = (double)vectorlength / syndromelength;
31
+ if(invalpha < 1) {
32
+ fprintf(stderr, "The message cannot be longer than the cover object.\n");
33
+ return -1;
34
+ }
35
+ shorter = (int)floor(invalpha);
36
+ longer = (int)ceil(invalpha);
37
+ if((columns[0] = getMatrix(shorter, matrixheight)) == NULL) {
38
+ free(widths);
39
+ free(matrices);
40
+ return -1;
41
+ }
42
+ if((columns[1] = getMatrix(longer, matrixheight)) == NULL) {
43
+ free(columns[0]);
44
+ free(widths);
45
+ free(matrices);
46
+ return -1;
47
+ }
48
+ worm = 0;
49
+ for(i = 0; i < syndromelength; i++) {
50
+ if(worm + longer <= (i + 1) * invalpha + 0.5) {
51
+ matrices[i] = 1;
52
+ widths[i] = longer;
53
+ worm += longer;
54
+ } else {
55
+ matrices[i] = 0;
56
+ widths[i] = shorter;
57
+ worm += shorter;
58
+ }
59
+ }
60
+ binmat[0] = (u8*)malloc(shorter * matrixheight * sizeof(u8));
61
+ binmat[1] = (u8*)malloc(longer * matrixheight * sizeof(u8));
62
+ for(i = 0, index = 0; i < shorter; i++) {
63
+ for(j = 0; j < matrixheight; j++, index++) {
64
+ binmat[0][index] = (columns[0][i] & (1 << j)) ? 1 : 0;
65
+ }
66
+ }
67
+ for(i = 0, index = 0; i < longer; i++) {
68
+ for(j = 0; j < matrixheight; j++, index++) {
69
+ binmat[1][index] = (columns[1][i] & (1 << j)) ? 1 : 0;
70
+ }
71
+ }
72
+ free(columns[0]);
73
+ free(columns[1]);
74
+ }
75
+
76
+ for(i = 0; i < syndromelength; i++) {
77
+ message[i] = 0;
78
+ }
79
+
80
+ for(index = 0, index2 = 0; index2 < syndromelength; index2++) {
81
+ for(k = 0, base = 0; k < widths[index2]; k++, index++, base += matrixheight) {
82
+ if(vector[index]) {
83
+ for(i = 0; i < height; i++) {
84
+ message[index2 + i] ^= binmat[matrices[index2]][base + i];
85
+ }
86
+ }
87
+ }
88
+ if(syndromelength - index2 <= matrixheight)
89
+ height--;
90
+ }
91
+
92
+ free(matrices);
93
+ free(widths);
94
+ free(binmat[0]);
95
+ free(binmat[1]);
96
+
97
+ return 0;
98
+ }
99
+ // }}}
100
+
101
+
stc_extract_c.h ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef STC_EXTRACT_C_H
2
+ #define STC_EXTRACT_C_H
3
+
4
+ #include "common.h"
5
+
6
+ /* Inputs:
7
+ stego - the binary stego vector
8
+ stegolength - the length of the stego vector
9
+ message - pointer to an array of legth 'messagelength' to receive the extracted message
10
+ messagelegth - the length of the embedded message
11
+ constr_height - the constraint height of the matrix used for embedding the message
12
+
13
+ Return values:
14
+ 0 on succes, -1 on error
15
+ */
16
+
17
+ int stc_extract(const u8 *stego, int stegolength, u8 *message, int messagelength, int constr_height = 10);
18
+
19
+ #endif
stc_interface.cpp ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <iomanip>
3
+ #include <cmath>
4
+ #include <cstdlib>
5
+ #include <ctime>
6
+
7
+ #include "stc_ml_c.h"
8
+ #include "stc_interface.h"
9
+
10
+
11
+ uint h = 10; // constraint height of STC code
12
+
13
+ int stc_hide(uint cover_length, int* cover, float* costs,
14
+ uint message_length, u8* message, int* stego) {
15
+
16
+ const uint n = cover_length;
17
+ uint m = message_length;
18
+
19
+ // if the message cannot be embedded due to large amount of
20
+ // wet pixels, then try again with smaller message. Try at most 10 times.
21
+ uint trials = 10;
22
+
23
+ //std::cout << "message_length: " << message_length << std::endl;
24
+ unsigned int* num_msg_bits = new unsigned int[2];
25
+ float dist = stc_pm1_pls_embed(n, cover, costs, m, message, h, 2147483647, stego, num_msg_bits, trials, 0);
26
+ //std::cout << "hide -->" << num_msg_bits[0] << ", " << num_msg_bits[1] << std::endl;
27
+ delete[] num_msg_bits;
28
+
29
+ return 0;
30
+ }
31
+
32
+
33
+ int stc_unhide(uint stego_length, int* stego,
34
+ uint message_length, u8* message) {
35
+
36
+ unsigned int* num_msg_bits = new unsigned int[2];
37
+ num_msg_bits[1] = (uint) (message_length/2);
38
+ num_msg_bits[0] = message_length-num_msg_bits[1];
39
+
40
+ //std::cout << "message_length: " << message_length << std::endl;
41
+ //std::cout << "unhide -->" << num_msg_bits[0] << ", " << num_msg_bits[1] << std::endl;
42
+
43
+ stc_ml_extract(stego_length, stego, 2, num_msg_bits, h, message);
44
+
45
+ return 0;
46
+ }
47
+
48
+
stc_interface.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #ifndef STC_INTERFACE_H
3
+ #define STC_INTERFACE_H
4
+
5
+ extern "C" {
6
+ int stc_hide(uint cover_length, int* cover, float* costs,
7
+ uint message_length, u8* message, int* stego);
8
+
9
+ int stc_unhide(uint stego_length, int* stego,
10
+ uint message_length, u8* message);
11
+ }
12
+
13
+ #endif
stc_ml_c.cpp ADDED
@@ -0,0 +1,932 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "stc_ml_c.h"
2
+
3
+ #include <xmmintrin.h>
4
+ #include <cmath>
5
+ #include <limits>
6
+ #include <algorithm>
7
+ #include <sstream>
8
+ #include <fstream>
9
+ #include <iomanip>
10
+ #include <string.h> // due to memcpy
11
+
12
+
13
+ #include <boost/random/uniform_int.hpp> // this is required for Marsene-Twister random number generator
14
+ #include <boost/random/variate_generator.hpp>
15
+ #include <boost/random/mersenne_twister.hpp>
16
+
17
+
18
+ #include "stc_embed_c.h"
19
+ #include "stc_extract_c.h"
20
+ #include "sse_mathfun.h" // library with optimized functions obtained from http://gruntthepeon.free.fr/ssemath/
21
+
22
// {{{ write_vector_to_file()
// Debugging aid: dump an n-element array as a two-column text table (index, value).
template< class T > void write_vector_to_file( uint n, T *ptr, const char* file_name ) {

    std::ofstream out( file_name );
    for ( uint idx = 0; idx < n; idx++ ) {
        out << std::left << std::setw( 20 ) << idx;
        out << std::left << std::setw( 20 ) << ptr[idx] << std::endl;
    }
    out.close();
}
// }}}
31
+
32
// {{{ write_matrix_to_file()
// Debugging aid: dump a column-ordered rows x columns matrix as a text table,
// one row per line, preceded by the row index.
template< class T > void write_matrix_to_file( uint rows, uint columns, T *ptr, const char* file_name ) {

    std::ofstream out( file_name );
    for ( uint r = 0; r < rows; r++ ) {
        out << std::left << std::setw( 20 ) << r;
        for ( uint col = 0; col < columns; col++ ) {
            out << std::left << std::setw( 20 ) << ptr[col * rows + r]; // column-major storage
        }
        out << std::endl;
    }
    out.close();
}
// }}}
46
+
47
// {{{ align_*()
// Templates to handle aligned version of new and delete operators.
// These functions are necessary for creating arrays aligned address of certain multiples, such as 16.
// Layout of the raw allocation: [padding][offset:int][aligned data ...].
// The int stored immediately below the returned pointer records the distance
// back to the raw new[] block so align_delete can recover it.
// NOTE(review): align_size is assumed to be a power of two -- confirm at call sites.
template< class T > T* align_new( unsigned int n, unsigned int align_size ) {
    char *ptr, *ptr2, *aligned_ptr;
    int align_mask = align_size - 1;

    // extra room for the worst-case alignment shift plus the stored offset
    ptr = new char[n * sizeof(T) + align_size + sizeof(int)];
    if ( ptr == 0 ) return 0;

    ptr2 = ptr + sizeof(int);
    // advance past the offset slot up to the next align_size boundary
    aligned_ptr = ptr2 + (align_size - ((size_t) ptr2 & align_mask));

    ptr2 = aligned_ptr - sizeof(int);
    *((int*) ptr2) = (int) (aligned_ptr - ptr); // bytes from raw block to aligned start

    return (T*) aligned_ptr;
}

// Free a block obtained from align_new. Must NOT be mixed with plain delete[]:
// the pointer handed out above is offset into the raw allocation.
template< class T > void align_delete( T *ptr ) {
    int *ptr2 = (int*) ptr - 1; // the stored offset sits right before the aligned data
    char *p;

    p = (char*) ptr;
    p -= *ptr2; // step back to the original new[] pointer
    delete[] p;
}
// }}}
75
+
76
+ // {{{ randperm()
77
+ /* Generates random permutation of length n based on the MT random number generator with seed 'seed'. */
78
+ void randperm( uint n, uint seed, uint* perm ) {
79
+
80
+ boost::mt19937 *generator = new boost::mt19937( seed );
81
+ boost::variate_generator< boost::mt19937, boost::uniform_int< > > *randi = new boost::variate_generator< boost::mt19937,
82
+ boost::uniform_int< > >( *generator, boost::uniform_int< >( 0, INT_MAX ) );
83
+
84
+ // generate random permutation - this is used to shuffle cover pixels to randomize the effect of different neighboring pixels
85
+ for ( uint i = 0; i < n; i++ )
86
+ perm[i] = i;
87
+ for ( uint i = 0; i < n; i++ ) {
88
+ uint j = (*randi)() % (n - i);
89
+ uint tmp = perm[i];
90
+ perm[i] = perm[i + j];
91
+ perm[i + j] = tmp;
92
+ }
93
+
94
+ delete generator;
95
+ delete randi;
96
+ }
97
+ // }}}
98
+
99
// {{{ sum_inplace()
// Horizontal sum: add the four packed floats of x and return the total as a scalar.
inline float sum_inplace( __m128 x ) {
    float y;
    // add all 4 terms from x together
    x = _mm_add_ps( x, _mm_shuffle_ps(x,x,_MM_SHUFFLE(1,0,3,2)) ); // swap 64-bit halves: lanes become (x0+x2, x1+x3, ...)
    x = _mm_add_ps( x, _mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)) ); // swap adjacent lanes: every lane now holds the full sum
    _mm_store_ss( &y, x ); // read the lowest lane
    return y;
}
// }}}
109
+
110
// {{{ calc_entropy()
/* Entropy, in bits, of the Gibbs distribution p_ij ~ exp(-lambda*rho_ij)
   induced by the k*n cost array 'costs' at parameter 'lambda'.
   Vectorized over 4 cover elements at a time; n is assumed to be a multiple
   of 4 and 'costs' 16-byte aligned (required by _mm_load_ps). */
float calc_entropy( uint n, uint k, float* costs, float lambda ) {

    float const LOG2 = log( 2.0 );
    __m128 inf = _mm_set1_ps( F_INF );
    __m128 v_lambda = _mm_set1_ps( -lambda );
    __m128 z, d, rho, p, entr, mask;

    entr = _mm_setzero_ps();
    for ( uint i = 0; i < n / 4; i++ ) {
        z = _mm_setzero_ps(); // partition function Z = sum_j exp(-lambda*rho_j)
        d = _mm_setzero_ps(); // sum_j rho_j * exp(-lambda*rho_j)
        for ( uint j = 0; j < k; j++ ) {
            rho = _mm_load_ps( costs + j * n + 4 * i ); // costs array must be aligned in memory
            p = exp_ps( _mm_mul_ps( v_lambda, rho ) );
            z = _mm_add_ps( z, p );

            mask = _mm_cmpeq_ps( rho, inf ); // skip infinite costs: rho*exp(-lambda*rho) -> 0 as rho -> Inf, but Inf*0 would yield NaN
            p = _mm_mul_ps( rho, p );
            p = _mm_andnot_ps( mask, p ); // apply mask
            d = _mm_add_ps( d, p );
        }
        // H = lambda*E[rho] + log(Z), accumulated in nats; converted to bits on return
        entr = _mm_sub_ps( entr, _mm_div_ps( _mm_mul_ps( v_lambda, d ), z ) );
        entr = _mm_add_ps( entr, log_ps( z ) );
    }
    return sum_inplace( entr ) / LOG2;
}
// }}}
138
+
139
+ // {{{ get_lambda_entropy()
140
+ float get_lambda_entropy( uint n, uint k, float *costs, float payload, float initial_lambda = 10 ) {
141
+
142
+ float p1, p2, p3, lambda1, lambda2, lambda3;
143
+ int j = 0;
144
+ uint iterations = 0;
145
+
146
+ lambda1 = 0;
147
+ p1 = n * log( (float)k ) / log( 2.0f );
148
+ lambda3 = initial_lambda;
149
+ p3 = payload + 1; // this is just an initial value
150
+ lambda2 = initial_lambda;
151
+ while ( p3 > payload ) {
152
+ lambda3 *= 2;
153
+ p3 = calc_entropy( n, k, costs, lambda3 );
154
+ j++;
155
+ iterations++;
156
+ // beta is probably unbounded => it seems that we cannot find beta such that
157
+ // relative payload will be smaller than requested. Binary search does not make sence here.
158
+ if ( j > 10 ) {
159
+ return lambda3;
160
+ }
161
+ }
162
+ while ( (p1 - p3) / n > payload / n * 1e-2 ) { // binary search for parameter lambda
163
+ lambda2 = lambda1 + (lambda3 - lambda1) / 2;
164
+ p2 = calc_entropy( n, k, costs, lambda2 );
165
+ if ( p2 < payload ) {
166
+ lambda3 = lambda2;
167
+ p3 = p2;
168
+ } else {
169
+ lambda1 = lambda2;
170
+ p1 = p2;
171
+ }
172
+ iterations++; // this is for monitoring the number of iterations
173
+ }
174
+ return lambda1 + (lambda3 - lambda1) / 2;
175
+ }
176
+ // }}}
177
+
178
// {{{ calc_distortion()
/* Expected distortion E[rho] under the Gibbs distribution p_ij ~ exp(-lambda*rho_ij)
   induced by the k*n cost array at parameter 'lambda'.
   Vectorized over 4 elements at a time; 'costs' must be 16-byte aligned. */
float calc_distortion( uint n, uint k, float* costs, float lambda ) {

    __m128 eps = _mm_set1_ps( std::numeric_limits< float >::epsilon() );
    __m128 v_lambda = _mm_set1_ps( -lambda );
    __m128 z, d, rho, p, dist, mask;

    dist = _mm_setzero_ps();
    for ( uint i = 0; i < n / 4; i++ ) { // n must be multiple of 4
        z = _mm_setzero_ps(); // partition function Z = sum_j exp(-lambda*rho_j)
        d = _mm_setzero_ps(); // sum_j rho_j * exp(-lambda*rho_j)
        for ( uint j = 0; j < k; j++ ) {
            rho = _mm_load_ps( costs + j * n + 4 * i ); // costs array must be aligned in memory
            p = exp_ps( _mm_mul_ps( v_lambda, rho ) );
            z = _mm_add_ps( z, p );
            mask = _mm_cmplt_ps( p, eps ); // if p<eps, then do not accumulate it to d since x*exp(-x) tends to zero
            p = _mm_mul_ps( rho, p );
            p = _mm_andnot_ps( mask, p );
            d = _mm_add_ps( d, p );
        }
        dist = _mm_add_ps( dist, _mm_div_ps( d, z ) ); // E[rho] contribution of these 4 elements
    }
    return sum_inplace( dist );
}
// }}}
203
+
204
+ // {{{ get_lambda_distortion()
205
+ float get_lambda_distortion( uint n, uint k, float *costs, float distortion, float initial_lambda = 10, float precision = 1e-3,
206
+ uint iter_limit = 30 ) {
207
+
208
+ float dist1, dist2, dist3, lambda1, lambda2, lambda3;
209
+ int j = 0;
210
+ uint iterations = 0;
211
+
212
+ lambda1 = 0;
213
+ dist1 = calc_distortion( n, k, costs, lambda1 );
214
+ lambda3 = initial_lambda;
215
+ dist2 = F_INF; // this is just an initial value
216
+ lambda2 = initial_lambda;
217
+ dist3 = distortion + 1;
218
+ while ( dist3 > distortion ) {
219
+ lambda3 *= 2;
220
+ dist3 = calc_distortion( n, k, costs, lambda3 );
221
+ j++;
222
+ iterations++;
223
+ // beta is probably unbounded => it seems that we cannot find beta such that
224
+ // relative payload will be smaller than requested. Binary search cannot converge.
225
+ if ( j > 10 ) {
226
+ return lambda3;
227
+ }
228
+ }
229
+ while ( (fabs( dist2 - distortion ) / n > precision) && (iterations < iter_limit) ) { // binary search for parameter lambda
230
+ lambda2 = lambda1 + (lambda3 - lambda1) / 2;
231
+ dist2 = calc_distortion( n, k, costs, lambda2 );
232
+ if ( dist2 < distortion ) {
233
+ lambda3 = lambda2;
234
+ dist3 = dist2;
235
+ } else {
236
+ lambda1 = lambda2;
237
+ dist1 = dist2;
238
+ }
239
+ iterations++; // this is for monitoring the number of iterations
240
+ }
241
+ return lambda1 + (lambda3 - lambda1) / 2;
242
+ }
243
+ // }}}
244
+
245
// {{{ binary_entropy_array()
/* Sum of binary entropies H(p_i) = -p*log2(p) - (1-p)*log2(1-p), in bits,
   over the n probabilities in 'prob'. Entries numerically equal to 0 or 1
   contribute nothing (the limit of p*log(p) is 0). */
float binary_entropy_array( uint n, float *prob ) {

    float const LOG2 = log( 2.0 );
    float const EPS = std::numeric_limits< float >::epsilon();
    float total = 0;

    for ( uint i = 0; i < n; i++ ) {
        float p = prob[i];
        if ( (p > EPS) && (1 - p > EPS) ) {
            total -= p * log( p ) + (1 - p) * log( 1 - p );
        }
    }
    return total / LOG2;
}
// }}}
258
+
259
// {{{ entropy_array()
/* Shannon entropy -sum_i p_i*log2(p_i), in bits, of the n-element probability
   array; accumulated in double precision. Zero entries are skipped. */
float entropy_array( uint n, float* prob ) {

    double const LOG2 = log( 2.0 );
    double const EPS = std::numeric_limits< double >::epsilon();
    double total = 0;

    for ( uint i = 0; i < n; i++ ) {
        if ( prob[i] > EPS ) {
            total -= prob[i] * log( prob[i] );
        }
    }
    return total / LOG2;
}
// }}}
272
+
273
// {{{ mod()
// Mathematical modulo: result is always in [0, m), even for negative x
// (unlike the C '%' operator, whose sign follows the dividend).
inline uint mod( int x, int m ) {
    int r = x - (x / m) * m; // C remainder; may be negative
    return (r + m) % m;
}
// }}}
279
+
280
+
281
+
282
+ /* EMBEDDING ALGORITHMS */
283
+
284
// {{{ stc_embed_trial()
/* One layer of the multi-layered construction: repeatedly attempt binary STC
   embedding until a permutation is found for which a solution exists.

   n                     - number of cover elements
   cover_bit_prob0       - per-element probability that the cover bit is 0
   message               - bits to embed
   stc_constraint_height - STC constraint height
   num_msg_bits          - in/out: bits to embed; decremented on every failed
                           trial (which also changes the permutation seed)
   perm                  - out: permutation used by the successful trial
   stego                 - out: resulting binary stego vector
   trial                 - in/out: accumulated trial counter
   max_trials            - maximum total number of trials before giving up
   debugging_file        - only used by the commented-out debugging dump

   Throws stc_exception(..., 6) when max_trials is exhausted; any stc_embed
   error other than error_id 4 ("no solution exists") is re-thrown. */
void stc_embed_trial( uint n, float* cover_bit_prob0, u8* message, uint stc_constraint_height, uint &num_msg_bits, uint* perm, u8* stego,
    uint &trial, uint max_trials, const char* debugging_file = "cost.txt" ) {

    bool success = false;
    u8* cover = new u8[n];
    double* cost = new double[n];
    while ( !success ) {
        // permutation is seeded by the bit count, so the extractor can re-derive it
        randperm( n, num_msg_bits, perm );
        for ( uint i = 0; i < n; i++ ) {
            cover[perm[i]] = (cover_bit_prob0[i] < 0.5) ? 1 : 0; // most likely bit value
            // cost of flipping away from the likely value: log-odds of the bit probabilities
            cost[perm[i]] = -log( (1 / std::max( cover_bit_prob0[i], 1 - cover_bit_prob0[i] )) - 1 );
            if ( cost[perm[i]] != cost[perm[i]] ) // if p20[i]>1 due to numerical error (this is possible due to float data type)
                cost[perm[i]] = D_INF; // then cost2[i] is NaN, it should be Inf
        }
        memcpy( stego, cover, n ); // initialize stego array by cover array
        // debugging
        // write_vector_to_file<double>(n, cost, debugging_file);
        try {
            if ( num_msg_bits != 0 ) stc_embed( cover, n, message, num_msg_bits, (void*) cost, true, stego, stc_constraint_height );
            success = true;
        } catch ( stc_exception& e ) {
            if ( e.error_id != 4 ) { // error_id=4 means No solution exists, thus we try to embed with different permutation.
                delete[] cost;
                delete[] cover;
                throw e;
            }
            num_msg_bits--; // by decreasing the number of bits, we change the permutation used to shuffle the bits
            trial++;
            if ( trial > max_trials ) {
                delete[] cost;
                delete[] cover;
                throw stc_exception( "Maximum number of trials in layered construction exceeded (2).", 6 );
            }
        }
    }
    delete[] cost;
    delete[] cover;
}
// }}}
324
+
325
+ // {{{ check_costs()
326
+ // SANITY CHECKS for cost arrays
327
+ void check_costs( uint n, uint k, float *costs ) {
328
+
329
+ bool test_nan, test_non_inf, test_minus_inf;
330
+ for ( uint i = 0; i < n; i++ ) {
331
+ test_nan = false; // Is any element NaN? Should be FALSE
332
+ test_non_inf = false; // Is any element finite? Should be TRUE
333
+ test_minus_inf = false; // Is any element minus Inf? should be FALSE
334
+ for ( uint j = 0; j < k; j++ ) {
335
+ test_nan |= (costs[k * i + j] != costs[k * i + j]);
336
+ test_non_inf |= ((costs[k * i + j] != -F_INF) & (costs[k * i + j] != F_INF));
337
+ test_minus_inf |= (costs[k * i + j] == -F_INF);
338
+ }
339
+ if ( test_nan ) {
340
+ std::stringstream ss;
341
+ ss << "Incorrect cost array." << i << "-th element contains NaN value. This is not a valid cost.";
342
+ throw stc_exception( ss.str(), 6 );
343
+ }
344
+ if ( !test_non_inf ) {
345
+ std::stringstream ss;
346
+ ss << "Incorrect cost array." << i << "-th element does not contain any finite cost value. This is not a valid cost.";
347
+ throw stc_exception( ss.str(), 6 );
348
+ }
349
+ if ( test_minus_inf ) {
350
+ std::stringstream ss;
351
+ ss << "Incorrect cost array." << i << "-th element contains -Inf value. This is not a valid cost.";
352
+ throw stc_exception( ss.str(), 6 );
353
+ }
354
+ }
355
+ }
356
+ // }}}
357
+
358
+ // {{{ stc_pm1_pls_embed()
359
+ // MULTI-LAYERED EMBEDDING for plus/minus one changes
360
+ // payload limited case - returns distortion
361
+ float stc_pm1_pls_embed( uint cover_length, int* cover, float* costs, uint message_length, u8* message, // input variables
362
+ uint stc_constraint_height, float wet_cost, // other input parameters
363
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output variables
364
+
365
+ return stc_pm1_dls_embed( cover_length, cover, costs, message_length, message, F_INF, stc_constraint_height, 0, wet_cost, stego,
366
+ num_msg_bits, max_trials, coding_loss );
367
+ }
368
+ // }}}
369
+
370
+ // {{{ stc_pm1_dls_embed()
371
+ // distortion limited case - returns distortion
372
+ float stc_pm1_dls_embed( uint cover_length, int* cover, float* costs, uint message_length, u8* message, float target_distortion, // input variables
373
+ uint stc_constraint_height, float expected_coding_loss, float wet_cost, // other input parameters
374
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output variables
375
+
376
+ check_costs( cover_length, 3, costs );
377
+ float dist = 0;
378
+
379
+ int *stego_values = new int[4 * cover_length];
380
+ float *costs_ml2 = new float[4 * cover_length];
381
+ for ( uint i = 0; i < cover_length; i++ ) {
382
+ costs_ml2[4 * i + mod( (cover[i] - 1 + 4), 4 )] = costs[3 * i + 0]; // set cost of changing by -1
383
+ stego_values[4 * i + mod( (cover[i] - 1 + 4), 4 )] = cover[i] - 1;
384
+ costs_ml2[4 * i + mod( (cover[i] + 0 + 4), 4 )] = costs[3 * i + 1]; // set cost of changing by 0
385
+ stego_values[4 * i + mod( (cover[i] + 0 + 4), 4 )] = cover[i];
386
+ costs_ml2[4 * i + mod( (cover[i] + 1 + 4), 4 )] = costs[3 * i + 2]; // set cost of changing by +1
387
+ stego_values[4 * i + mod( (cover[i] + 1 + 4), 4 )] = cover[i] + 1;
388
+ costs_ml2[4 * i + mod( (cover[i] + 2 + 4), 4 )] = wet_cost; // set cost of changing by +2
389
+ stego_values[4 * i + mod( (cover[i] + 2 + 4), 4 )] = cover[i] + 2;
390
+ }
391
+
392
+ // run general 2 layered embedding in distortion limited regime
393
+ dist = stc_ml2_embed( cover_length, costs_ml2, stego_values, message_length, message, target_distortion, stc_constraint_height,
394
+ expected_coding_loss, stego, num_msg_bits, max_trials, coding_loss );
395
+ delete[] costs_ml2;
396
+ delete[] stego_values;
397
+
398
+ return dist;
399
+ }
400
+ // }}}
401
+
402
+ // {{{ stc_pm2_dls_embed()
403
+ // MULTI-LAYERED EMBEDDING for plus/minus one and two changes
404
+ // payload limited case - returns distortion
405
+ float stc_pm2_pls_embed( uint cover_length, int* cover, float* costs, uint message_length, u8* message, // input variables
406
+ uint stc_constraint_height, float wet_cost, // other input parameters
407
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output variables
408
+
409
+ return stc_pm2_dls_embed( cover_length, cover, costs, message_length, message, F_INF, stc_constraint_height, 0, wet_cost, stego,
410
+ num_msg_bits, max_trials, coding_loss );
411
+ }
412
+ // }}}
413
+
414
+ // {{{ stc_pm2_dls_embed()
415
+ // distortion limited case - returns distortion
416
+ float stc_pm2_dls_embed( uint cover_length, int* cover, float* costs, uint message_length, u8* message, float target_distortion, // input variables
417
+ uint stc_constraint_height, float expected_coding_loss, float wet_cost, // other input parameters
418
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output variables
419
+
420
+ check_costs( cover_length, 5, costs );
421
+ int *stego_values = new int[8 * cover_length];
422
+ float* costs_ml3 = new float[8 * cover_length];
423
+ std::fill_n( costs_ml3, 8 * cover_length, wet_cost ); // initialize new cost array
424
+
425
+ for ( uint i = 0; i < cover_length; i++ ) {
426
+ costs_ml3[8 * i + mod( (cover[i] - 2 + 8), 8 )] = costs[5 * i + 0]; // set cost of changing by -2
427
+ stego_values[8 * i + mod( (cover[i] - 2 + 8), 8 )] = cover[i] - 2;
428
+ costs_ml3[8 * i + mod( (cover[i] - 1 + 8), 8 )] = costs[5 * i + 1]; // set cost of changing by -1
429
+ stego_values[8 * i + mod( (cover[i] - 1 + 8), 8 )] = cover[i] - 1;
430
+ costs_ml3[8 * i + mod( (cover[i] + 0 + 8), 8 )] = costs[5 * i + 2]; // set cost of changing by 0
431
+ stego_values[8 * i + mod( (cover[i] + 0 + 8), 8 )] = cover[i] + 0;
432
+ costs_ml3[8 * i + mod( (cover[i] + 1 + 8), 8 )] = costs[5 * i + 3]; // set cost of changing by +1
433
+ stego_values[8 * i + mod( (cover[i] + 1 + 8), 8 )] = cover[i] + 1;
434
+ costs_ml3[8 * i + mod( (cover[i] + 2 + 8), 8 )] = costs[5 * i + 4]; // set cost of changing by +2
435
+ stego_values[8 * i + mod( (cover[i] + 2 + 8), 8 )] = cover[i] + 2;
436
+ stego_values[8 * i + mod( (cover[i] + 3 + 8), 8 )] = cover[i] + 3; // these values are not used and are defined
437
+ stego_values[8 * i + mod( (cover[i] + 4 + 8), 8 )] = cover[i] + 4; // just to have the array complete
438
+ stego_values[8 * i + mod( (cover[i] + 5 + 8), 8 )] = cover[i] + 5; //
439
+ }
440
+
441
+ // run general 3 layered embedding in distortion limited regime
442
+ float dist = stc_ml3_embed( cover_length, costs_ml3, stego_values, message_length, message, target_distortion, stc_constraint_height,
443
+ expected_coding_loss, stego, num_msg_bits, max_trials, coding_loss );
444
+ delete[] costs_ml3;
445
+ delete[] stego_values;
446
+
447
+ return dist;
448
+ }
449
+ // }}}
450
+
451
+ // GENERAL MULTI-LAYERED EMBEDDING
452
+
453
+ // {{{ stc_ml1_embed()
454
+ // algorithm for embedding into 1 layer, both payload- and distortion-limited case
455
+ float stc_ml1_embed( uint cover_length, int* cover, short* direction, float* costs, uint message_length, u8* message,
456
+ float target_distortion,// input variables
457
+ uint stc_constraint_height, float expected_coding_loss, // other input parameters
458
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output variables
459
+
460
+ float distortion, lambda = 0, m_max = 0;
461
+ bool success = false;
462
+ uint m_actual = 0;
463
+ uint n = cover_length + 4 - (cover_length % 4); // cover length rounded to multiple of 4
464
+ uint *perm1 = new uint[n];
465
+
466
+ float* c = align_new< float > ( 2 * n, 16 );
467
+ std::fill_n( c, 2 * n, F_INF );
468
+ std::fill_n( c, n, 0 );
469
+ for ( uint i = 0; i < cover_length; i++ ) { // copy and transpose data for better reading via SSE instructions
470
+ c[mod( cover[i], 2 ) * n + i] = 0; // cost of not changing the element
471
+ c[mod( (cover[i] + 1), 2 ) * n + i] = costs[i]; // cost of changing the element
472
+ }
473
+
474
+ if ( target_distortion != F_INF ) { // distortion-limited sender
475
+ lambda = get_lambda_distortion( n, 2, c, target_distortion, 2 ); //
476
+ m_max = (1 - expected_coding_loss) * calc_entropy( n, 2, c, lambda ); //
477
+ m_actual = std::min( message_length, (uint) floor( m_max ) ); //
478
+ }
479
+ if ( (target_distortion == F_INF) || (m_actual < floor( m_max )) ) { // payload-limited sender
480
+ m_actual = std::min( cover_length, message_length ); // or distortion-limited sender with
481
+ }
482
+
483
+ /* SINGLE LAYER OF 1ST LSBs */
484
+ num_msg_bits[0] = m_actual;
485
+ uint trial = 0;
486
+ u8* cover1 = new u8[cover_length];
487
+ double* cost1 = new double[cover_length];
488
+ u8* stego1 = new u8[cover_length];
489
+ while ( !success ) {
490
+ randperm( cover_length, num_msg_bits[0], perm1 );
491
+ for ( uint i = 0; i < cover_length; i++ ) {
492
+ cover1[perm1[i]] = mod( cover[i], 2 );
493
+ cost1[perm1[i]] = costs[i];
494
+ if ( cost1[perm1[i]] != cost1[perm1[i]] ) cost1[perm1[i]] = D_INF;
495
+ }
496
+ memcpy( stego1, cover1, cover_length ); // initialize stego array by cover array
497
+ // debugging
498
+ // write_vector_to_file<double>(n, cost, debugging_file);
499
+ try {
500
+ if ( num_msg_bits[0] != 0 ) stc_embed( cover1, cover_length, message, num_msg_bits[0], (void*) cost1, true, stego1,
501
+ stc_constraint_height );
502
+ success = true;
503
+ } catch ( stc_exception& e ) {
504
+ if ( e.error_id != 4 ) { // error_id=4 means No solution exists, thus we try to embed with different permutation.
505
+ delete[] cost1;
506
+ delete[] cover1;
507
+ delete[] stego1;
508
+ delete[] perm1;
509
+ delete[] c;
510
+ throw e;
511
+ }
512
+ num_msg_bits[0]--; // by decreasing the number of bits, we change the permutation used to shuffle the bits
513
+ trial++;
514
+ if ( trial > max_trials ) {
515
+ delete[] cost1;
516
+ delete[] cover1;
517
+ delete[] stego1;
518
+ delete[] perm1;
519
+ delete[] c;
520
+ throw stc_exception( "Maximum number of trials in layered construction exceeded (1).", 6 );
521
+ }
522
+ }
523
+ }
524
+
525
+ /* FINAL CALCULATIONS */
526
+ distortion = 0;
527
+ for ( uint i = 0; i < cover_length; i++ ) {
528
+ stego[i] = (stego1[perm1[i]] == cover1[perm1[i]]) ? cover[i] : cover[i] + direction[i];
529
+ distortion += (stego1[perm1[i]] == cover1[perm1[i]]) ? 0 : costs[i];
530
+ }
531
+ if ( coding_loss != 0 ) {
532
+ float lambda_dist = get_lambda_distortion( n, 2, c, distortion, lambda, 0, 20 ); // use 20 iterations to make lambda_dist precise
533
+ float max_payload = calc_entropy( n, 2, c, lambda_dist );
534
+ (*coding_loss) = (max_payload - m_actual) / max_payload; // fraction of max_payload lost due to practical coding scheme
535
+ }
536
+ max_trials = trial;
537
+
538
+ delete[] cost1;
539
+ delete[] cover1;
540
+ delete[] stego1;
541
+ delete[] perm1;
542
+ align_delete< float > ( c );
543
+
544
+ return distortion;
545
+ }
546
+ // }}}
547
+
548
// {{{ stc_ml2_embed()
// algorithm for embedding into 2 layers with possibility to use only 1 layer, both payload- and distortion-limited cases
/** Multi-layered STC embedding into the two least significant bit planes.
 *
 *  Falls back to a single binary layer (stc_ml1_embed) when, for every cover
 *  element, at most two stego values have finite cost and their LSBs differ.
 *
 *  @param cover_length          number of cover elements
 *  @param costs                 4*cover_length entries; costs[4*i+k] = cost of making the
 *                               two LSBs of element i equal to k (F_INF marks a forbidden value)
 *  @param stego_values          4*cover_length candidate stego values, parallel to costs
 *  @param message_length        number of available message bits in 'message'
 *  @param message               message bits (one bit per u8)
 *  @param target_distortion     distortion bound, or F_INF for the payload-limited case
 *  @param stc_constraint_height STC constraint height (speed vs. efficiency trade-off)
 *  @param expected_coding_loss  anticipated relative capacity loss of the practical code
 *  @param stego                 [out] cover_length resulting stego values
 *  @param num_msg_bits          [out] num_msg_bits[1] = bits in 2nd LSB plane,
 *                               num_msg_bits[0] = bits in 1st LSB plane
 *  @param max_trials            [in] maximum embedding trials, [out] trials actually used
 *  @param coding_loss           [out, optional] measured coding loss; pass 0 to skip
 *  @return total embedding distortion
 *  @throws stc_exception from stc_embed_trial; all temporaries are released before rethrow
 */
float stc_ml2_embed( uint cover_length, float* costs, int* stego_values, uint message_length, u8* message, float target_distortion, // input variables
    uint stc_constraint_height, float expected_coding_loss, // other input parameters
    int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output and optional variables

    float distortion, dist_coding_loss, lambda = 0, m_max = 0;
    uint m_actual = 0;
    uint n = cover_length + 4 - (cover_length % 4); // cover length rounded to multiple of 4 (padding needed by the SSE loops below)

    check_costs( cover_length, 4, costs );
    // if only binary embedding is sufficient, then use only 1st LSB layer
    bool lsb1_only = true;
    for ( uint i = 0; i < cover_length; i++ ) {
        uint n_finite_costs = 0; // number of finite cost values
        uint lsb_xor = 0;        // XOR of the LSBs of all finitely-priced candidates
        for ( uint k = 0; k < 4; k++ )
            if ( costs[4 * i + k] != F_INF ) {
                n_finite_costs++;
                lsb_xor ^= (k % 2);
            }
        // element qualifies iff it offers at most 2 choices and those choices differ in their LSB
        lsb1_only &= ((n_finite_costs <= 2) & (lsb_xor == 1));
    }
    if ( lsb1_only ) { // use stc_ml1_embed method
        distortion = 0;
        int *cover = new int[cover_length];
        short *direction = new short[cover_length];
        float *costs_ml1 = new float[cover_length];
        for ( uint i = 0; i < cover_length; i++ ) { // normalize such that minimal element is 0 - this helps numerical stability
            uint min_id = 0;
            float f_min = F_INF;
            for ( uint j = 0; j < 4; j++ )
                if ( f_min > costs[4 * i + j] ) {
                    f_min = costs[4 * i + j]; // minimum value
                    min_id = j; // index of the minimal entry
                }
            costs_ml1[i] = F_INF;
            cover[i] = stego_values[4 * i + min_id]; // cheapest candidate plays the role of the "cover" value
            for ( uint j = 0; j < 4; j++ )
                if ( (costs[4 * i + j] != F_INF) && (min_id != j) ) {
                    distortion += f_min;
                    costs_ml1[i] = costs[4 * i + j] - f_min; // extra cost of flipping the LSB
                    direction[i] = stego_values[4 * i + j] - cover[i]; // signed step that realizes the flip
                }
        }

        distortion += stc_ml1_embed( cover_length, cover, direction, costs_ml1, message_length, message, target_distortion,
            stc_constraint_height, expected_coding_loss, stego, num_msg_bits, max_trials, coding_loss );
        delete[] direction;
        delete[] costs_ml1;
        delete[] cover;
        return distortion;
    }

    // copy and transpose data for faster reading via SSE instructions
    float* c = align_new< float > ( 4 * n, 16 );
    std::fill_n( c, 4 * n, F_INF );
    std::fill_n( c, n, 0 ); // padding columns: first row 0, rest F_INF, so they contribute nothing
    for ( uint i = 0; i < 4 * cover_length; i++ )
        c[n * (i % 4) + i / 4] = costs[i];
    // write_matrix_to_file<float>(n, 4, c, "cost_ml2.txt");
    for ( uint i = 0; i < n; i++ ) { // normalize such that minimal element is 0 - this helps numerical stability
        float f_min = F_INF;
        for ( uint j = 0; j < 4; j++ )
            f_min = std::min( f_min, c[j * n + i] );
        for ( uint j = 0; j < 4; j++ )
            c[j * n + i] -= f_min;
    }

    if ( target_distortion != F_INF ) { // distortion-limited: derive lambda from the distortion bound
        lambda = get_lambda_distortion( n, 4, c, target_distortion, 2 );
        m_max = (1 - expected_coding_loss) * calc_entropy( n, 4, c, lambda );
        m_actual = std::min( message_length, (uint) floor( m_max ) );
    }
    if ( (target_distortion == F_INF) || (m_actual < floor( m_max )) ) { // payload-limited: derive lambda from the payload
        m_actual = std::min( 2 * cover_length, message_length );
        lambda = get_lambda_entropy( n, 4, c, m_actual, 2 );
    }
    /*
       p = exp(-lambda*costs);
       p = p./(ones(4,1)*sum(p));
    */
    // Gibbs distribution over the 4 candidates of each element, computed 4 columns at a time with SSE.
    float* p = align_new< float > ( 4 * n, 16 );
    __m128 v_lambda = _mm_set1_ps( -lambda );
    for ( uint i = 0; i < n / 4; i++ ) {
        __m128 sum = _mm_setzero_ps();
        for ( uint j = 0; j < 4; j++ ) {
            __m128 x = _mm_load_ps( c + j * n + 4 * i );
            x = exp_ps( _mm_mul_ps( v_lambda, x ) );
            _mm_store_ps( p + j * n + 4 * i, x );
            sum = _mm_add_ps( sum, x );
        }
        for ( uint j = 0; j < 4; j++ ) { // normalize each column to a probability distribution
            __m128 x = _mm_load_ps( p + j * n + 4 * i );
            x = _mm_div_ps( x, sum );
            _mm_store_ps( p + j * n + 4 * i, x );
        }
    }
    // this is for debugging purposes
    // float payload_dbg = entropy_array(4*n, p);

    uint trial = 0;
    float* p10 = new float[cover_length];
    float* p20 = new float[cover_length];
    u8* stego1 = new u8[cover_length];
    u8* stego2 = new u8[cover_length];
    uint *perm1 = new uint[cover_length];
    uint *perm2 = new uint[cover_length];

    /* LAYER OF 2ND LSBs */
    for ( uint i = 0; i < cover_length; i++ )
        p20[i] = p[i] + p[i + n]; // p20 = p(1,:)+p(2,:); % probability of 2nd LSB of stego equal 0
    //num_msg_bits[1] = (uint) floor( binary_entropy_array( cover_length, p20 ) ); // msg_bits(2) = floor(sum(binary_entropy(p20))); % number of msg bits embedded into 2nd LSBs
    // NOTE(review): deliberate deviation from the original entropy-based split above —
    // the payload is split in half so the extractor can recompute num_msg_bits without
    // knowing the costs. Verify this matches the split assumed by the caller/extractor.
    num_msg_bits[1] = (uint) (message_length/2 /*+ message_length%2*/ ); // XXX

    try {
        stc_embed_trial( cover_length, p20, message, stc_constraint_height, num_msg_bits[1], perm2, stego2, trial, max_trials, "cost2.txt" );
    } catch ( stc_exception& e ) {
        // release every temporary before propagating the failure
        delete[] p10;
        delete[] p20;
        delete[] perm1;
        delete[] perm2;
        delete[] stego1;
        delete[] stego2;
        align_delete< float > ( c );
        align_delete< float > ( p );
        throw e;
    }

    /* LAYER OF 1ST LSBs */
    for ( uint i = 0; i < cover_length; i++ ) //
        if ( stego2[perm2[i]] == 0 ) // % conditional probability of 1st LSB of stego equal 0 given LSB2=0
            p10[i] = p[i] / (p[i] + p[i + n]); // p10(i) = p(1,i)/(p(1,i)+p(2,i));
        else // % conditional probability of 1st LSB of stego equal 0 given LSB2=1
            p10[i] = p[i + 2 * n] / (p[i + 2 * n] + p[i + 3 * n]); // p10(i) = p(3,i)/(p(3,i)+p(4,i));
    num_msg_bits[0] = m_actual - num_msg_bits[1]; // msg_bits(1) = m_actual-msg_bits(2); % number of msg bits embedded into 1st LSBs
    try {
        stc_embed_trial( cover_length, p10, message + num_msg_bits[1], stc_constraint_height, num_msg_bits[0], perm1, stego1, trial,
            max_trials, "cost1.txt" );
    } catch ( stc_exception& e ) {
        // release every temporary before propagating the failure
        delete[] p10;
        delete[] p20;
        delete[] perm1;
        delete[] perm2;
        delete[] stego1;
        delete[] stego2;
        align_delete< float > ( c );
        align_delete< float > ( p );
        throw e;
    }
    delete[] p10;
    delete[] p20;

    /* FINAL CALCULATIONS */
    // combine both bit planes: index 2*LSB2 + LSB1 selects the stego value/cost of element i
    distortion = 0;
    for ( uint i = 0; i < cover_length; i++ ) {
        stego[i] = stego_values[4 * i + 2 * stego2[perm2[i]] + stego1[perm1[i]]];
        distortion += costs[4 * i + 2 * stego2[perm2[i]] + stego1[perm1[i]]];
    }
    if ( coding_loss != 0 ) {
        dist_coding_loss = 0;
        for ( uint i = 0; i < cover_length; i++ )
            dist_coding_loss += c[i + n * (2 * stego2[perm2[i]] + stego1[perm1[i]])];
        float lambda_dist = get_lambda_distortion( n, 4, c, dist_coding_loss, lambda, 0, 20 ); // use 20 iterations to make lambda_dist precise
        float max_payload = calc_entropy( n, 4, c, lambda_dist );
        (*coding_loss) = (max_payload - m_actual) / max_payload; // fraction of max_payload lost due to practical coding scheme
    }
    max_trials = trial;

    delete[] stego1;
    delete[] stego2;
    delete[] perm1;
    delete[] perm2;
    align_delete< float > ( c );
    align_delete< float > ( p );

    return distortion;
}
// }}}
727
+
728
// {{{ stc_ml3_embed()
// algorithm for embedding into 3 layers, both payload- and distortion-limited case
/** Multi-layered STC embedding into the three least significant bit planes.
 *
 *  Layers are embedded from the 3rd LSB plane down to the 1st; each lower layer
 *  uses probabilities conditioned on the stego bits already fixed above it.
 *
 *  @param cover_length          number of cover elements
 *  @param costs                 8*cover_length entries; costs[8*i+k] = cost of making the
 *                               three LSBs of element i equal to k (F_INF marks a forbidden value)
 *  @param stego_values          8*cover_length candidate stego values, parallel to costs
 *  @param message_length        number of available message bits in 'message'
 *  @param message               message bits (one bit per u8)
 *  @param target_distortion     distortion bound, or F_INF for the payload-limited case
 *  @param stc_constraint_height STC constraint height (speed vs. efficiency trade-off)
 *  @param expected_coding_loss  anticipated relative capacity loss of the practical code
 *  @param stego                 [out] cover_length resulting stego values
 *  @param num_msg_bits          [out] bits embedded per plane: [2]=3rd, [1]=2nd, [0]=1st LSBs
 *  @param max_trials            [in] maximum embedding trials, [out] trials actually used
 *  @param coding_loss           [out, optional] measured coding loss; pass 0 to skip
 *  @return total embedding distortion
 *  @throws stc_exception from stc_embed_trial; all temporaries are released before rethrow
 */
float stc_ml3_embed( uint cover_length, float* costs, int* stego_values, uint message_length, u8* message, float target_distortion, // input variables
    uint stc_constraint_height, float expected_coding_loss, // other input parameters
    int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss ) { // output and optional variables

    float distortion, dist_coding_loss, lambda = 0, m_max = 0;
    uint m_actual = 0;
    uint n = cover_length + 4 - (cover_length % 4); // cover length rounded to multiple of 4 (padding needed by the SSE loops below)

    check_costs( cover_length, 8, costs );
    float* c = align_new< float > ( 8 * n, 16 );
    std::fill_n( c, 8 * n, F_INF );
    std::fill_n( c, n, 0 ); // padding columns: first row 0, rest F_INF, so they contribute nothing
    for ( uint i = 0; i < 8 * cover_length; i++ )
        c[n * (i % 8) + i / 8] = costs[i]; // copy and transpose data for better reading via SSE instructions
    // write_matrix_to_file<float>(n, 8, c, "cost_ml3.txt");
    for ( uint i = 0; i < n; i++ ) { // normalize such that minimal element is 0 - this helps numerical stability
        float f_min = F_INF;
        for ( uint j = 0; j < 8; j++ )
            f_min = std::min( f_min, c[j * n + i] );
        for ( uint j = 0; j < 8; j++ )
            c[j * n + i] -= f_min;
    }

    if ( target_distortion != F_INF ) { // distortion-limited: derive lambda from the distortion bound
        lambda = get_lambda_distortion( n, 8, c, target_distortion, 2.0 );
        m_max = (1 - expected_coding_loss) * calc_entropy( n, 8, c, lambda );
        m_actual = std::min( message_length, (uint) floor( m_max ) );
    }
    if ( (target_distortion == F_INF) || (m_actual < floor( m_max )) ) { // payload-limited: derive lambda from the payload
        m_actual = std::min( 3 * cover_length, message_length );
        lambda = get_lambda_entropy( n, 8, c, m_actual, 2.0 );
    }
    /*
       p = exp(-lambda*costs);
       p = p./(ones(8,1)*sum(p));
    */
    // Gibbs distribution over the 8 candidates of each element, computed 4 columns at a time with SSE.
    float* p = align_new< float > ( 8 * n, 16 );
    __m128 v_lambda = _mm_set1_ps( -lambda );
    for ( uint i = 0; i < n / 4; i++ ) {
        __m128 sum = _mm_setzero_ps();
        for ( uint j = 0; j < 8; j++ ) {
            __m128 x = _mm_load_ps( c + j * n + 4 * i );
            x = exp_ps( _mm_mul_ps( v_lambda, x ) );
            _mm_store_ps( p + j * n + 4 * i, x );
            sum = _mm_add_ps( sum, x );
        }
        for ( uint j = 0; j < 8; j++ ) { // normalize each column to a probability distribution
            __m128 x = _mm_load_ps( p + j * n + 4 * i );
            x = _mm_div_ps( x, sum );
            _mm_store_ps( p + j * n + 4 * i, x );
        }
    }
    // this is for debugging
    // float payload_dbg = entropy_array(8*n, p);

    uint trial = 0;
    float* p10 = new float[cover_length];
    float* p20 = new float[cover_length];
    float* p30 = new float[cover_length];
    u8* stego1 = new u8[cover_length];
    u8* stego2 = new u8[cover_length];
    u8* stego3 = new u8[cover_length];
    uint *perm1 = new uint[cover_length];
    uint *perm2 = new uint[cover_length];
    uint *perm3 = new uint[cover_length];

    /* LAYER OF 3RD LSBs */
    for ( uint i = 0; i < cover_length; i++ )
        p30[i] = p[i] + p[i + n] + p[i + 2 * n] + p[i + 3 * n]; // probability of 3rd LSB of stego equal 0
    num_msg_bits[2] = (uint) floor( binary_entropy_array( cover_length, p30 ) ); // number of msg bits embedded into 3rd LSBs
    try {
        stc_embed_trial( cover_length, p30, message, stc_constraint_height, num_msg_bits[2], perm3, stego3, trial, max_trials, "cost3.txt" );
    } catch ( stc_exception& e ) {
        // release every temporary before propagating the failure
        delete[] p10;
        delete[] p20;
        delete[] p30;
        delete[] perm1;
        delete[] perm2;
        delete[] perm3;
        delete[] stego1;
        delete[] stego2;
        delete[] stego3;
        align_delete< float > ( c );
        align_delete< float > ( p );
        throw e;
    }

    /* LAYER OF 2ND LSBs */
    for ( uint i = 0; i < cover_length; i++ ) { //
        int s = 4 * stego3[perm3[i]]; // % conditional probability of 2nd LSB of stego equal 0 given LSB3
        p20[i] = (p[i + s * n] + p[i + (s + 1) * n]) / (p[i + s * n] + p[i + (s + 1) * n] + p[i + (s + 2) * n] + p[i + (s + 3) * n]);
    }
    num_msg_bits[1] = (uint) floor( binary_entropy_array( cover_length, p20 ) );// msg_bits(2) = floor(sum(binary_entropy(p20))); % number of msg bits embedded into 2nd LSBs
    try {
        stc_embed_trial( cover_length, p20, message + num_msg_bits[2], stc_constraint_height, num_msg_bits[1], perm2, stego2, trial,
            max_trials, "cost2.txt" );
    } catch ( stc_exception& e ) {
        // release every temporary before propagating the failure
        delete[] p10;
        delete[] p20;
        delete[] p30;
        delete[] perm1;
        delete[] perm2;
        delete[] perm3;
        delete[] stego1;
        delete[] stego2;
        delete[] stego3;
        align_delete< float > ( c );
        align_delete< float > ( p );
        throw e;
    }

    /* LAYER OF 1ST LSBs */
    for ( uint i = 0; i < cover_length; i++ ) { //
        int s = 4 * stego3[perm3[i]] + 2 * stego2[perm2[i]]; // % conditional probability of 1st LSB of stego equal 0 given LSB3 and LSB2
        p10[i] = p[i + s * n] / (p[i + s * n] + p[i + (s + 1) * n]);
    }
    num_msg_bits[0] = m_actual - num_msg_bits[1] - num_msg_bits[2]; // msg_bits(1) = m_actual-msg_bits(2)-msg_bits(3); % number of msg bits embedded into 1st LSBs
    try {
        stc_embed_trial( cover_length, p10, message + num_msg_bits[1] + num_msg_bits[2], stc_constraint_height, num_msg_bits[0], perm1,
            stego1, trial, max_trials, "cost1.txt" );
    } catch ( stc_exception& e ) {
        // release every temporary before propagating the failure
        delete[] p10;
        delete[] p20;
        delete[] p30;
        delete[] perm1;
        delete[] perm2;
        delete[] perm3;
        delete[] stego1;
        delete[] stego2;
        delete[] stego3;
        align_delete< float > ( c );
        align_delete< float > ( p );
        throw e;
    }
    delete[] p10;
    delete[] p20;
    delete[] p30;
    max_trials = trial;

    /* FINAL CALCULATIONS */
    // combine all three bit planes: index 4*LSB3 + 2*LSB2 + LSB1 selects the stego value/cost of element i
    distortion = 0;
    for ( uint i = 0; i < cover_length; i++ ) {
        stego[i] = stego_values[8 * i + 4 * stego3[perm3[i]] + 2 * stego2[perm2[i]] + stego1[perm1[i]]];
        distortion += costs[8 * i + 4 * stego3[perm3[i]] + 2 * stego2[perm2[i]] + stego1[perm1[i]]];
    }
    if ( coding_loss != 0 ) {
        dist_coding_loss = 0;
        for ( uint i = 0; i < cover_length; i++ )
            dist_coding_loss += c[i + n * (4 * stego3[perm3[i]] + 2 * stego2[perm2[i]] + stego1[perm1[i]])];
        float lambda_dist = get_lambda_distortion( n, 8, c, dist_coding_loss, lambda, 0, 20 ); // use 20 iterations to make lambda_dist precise
        float max_payload = calc_entropy( n, 8, c, lambda_dist );
        (*coding_loss) = (max_payload - m_actual) / max_payload; // fraction of max_payload lost due to practical coding scheme
    }

    delete[] perm1;
    delete[] perm2;
    delete[] perm3;
    delete[] stego1;
    delete[] stego2;
    delete[] stego3;
    align_delete< float > ( c );
    align_delete< float > ( p );

    return distortion;
}
// }}}
896
+
897
+
898
+ /* EXTRACTION ALGORITHMS */
899
+
900
+ // {{{ stc_ml_extract()
901
+ /** Extraction algorithm for any l-layered construction.
902
+ @param stego_length - ...
903
+ @param stego - ...
904
+ @param msg_bits - ...
905
+ @param stc_constraint_height - ...
906
+ @param message - ...
907
+ */
908
+ void stc_ml_extract( uint stego_length, int* stego, uint num_of_layers, uint* num_msg_bits, // input variables
909
+ uint stc_constraint_height, // other input parameters
910
+ u8* message ) { // output variables
911
+
912
+ u8* stego_bits = new u8[stego_length];
913
+ u8* msg_ptr = message;
914
+ uint *perm = new uint[stego_length];
915
+
916
+ for ( uint l = num_of_layers; l > 0; l-- ) { // extract message from every layer starting from most significant ones
917
+ // extract bits from l-th LSB plane
918
+ if ( num_msg_bits[l - 1] > 0 ) {
919
+ randperm( stego_length, num_msg_bits[l - 1], perm );
920
+ for ( uint i = 0; i < stego_length; i++ )
921
+ stego_bits[perm[i]] = mod( stego[i], (1 << l) ) >> (l - 1);
922
+ stc_extract( stego_bits, stego_length, msg_ptr, num_msg_bits[l - 1], stc_constraint_height );
923
+ msg_ptr += num_msg_bits[l - 1];
924
+ }
925
+ }
926
+
927
+ delete[] stego_bits;
928
+ delete[] perm;
929
+ }
930
+ // }}}
931
+
932
+
stc_ml_c.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef STC_ML_H
2
+ #define STC_ML_H
3
+
4
+ #include <limits>
5
+ #include "common.h"
6
+ #include "stc_embed_c.h"
7
+ #include "stc_extract_c.h"
8
+
9
+ typedef unsigned int uint;
10
+ typedef unsigned char u8;
11
+
12
+ const float F_INF = std::numeric_limits<float>::infinity();
13
+ const float D_INF = std::numeric_limits<double>::infinity();
14
+
15
+ // EMBEDDING ALGORITHMS ***********************************************************************************************************
16
+
17
+ // MULTI-LAYERED EMBEDDING for plus/minus one changes
18
+ // payload limited case - returns distortion
19
+ float stc_pm1_pls_embed(uint cover_length, int* cover, float* costs, uint message_length, u8* message, // input variables
20
+ uint stc_constraint_height, float wet_cost, // other input parameters
21
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output variables
22
+ // distortion limited case - returns distortion
23
+ float stc_pm1_dls_embed(uint cover_length, int* cover, float* costs, uint message_length, u8* message, float target_distortion, // input variables
24
+ uint stc_constraint_height, float expected_coding_loss, float wet_cost, // other input parameters
25
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output variables
26
+
27
+ // MULTI-LAYERED EMBEDDING for plus/minus one and two changes
28
+ // payload limited case - returns distortion
29
+ float stc_pm2_pls_embed(uint cover_length, int* cover, float* costs, uint message_length, u8* message, // input variables
30
+ uint stc_constraint_height, float wet_cost, // other input parameters
31
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output variables
32
+ // distortion limited case - returns distortion
33
+ float stc_pm2_dls_embed(uint cover_length, int* cover, float* costs, uint message_length, u8* message, float target_distortion, // input variables
34
+ uint stc_constraint_height, float expected_coding_loss, float wet_cost, // other input parameters
35
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output variables
36
+
37
+ // GENERAL MULTI-LAYERED EMBEDDING
38
+ // algorithm for embedding into 1 layer, both payload- and distortion-limited case
39
+ float stc_ml1_embed(uint cover_length, int* cover, short* direction, float* costs, uint message_length, u8* message, float target_distortion,// input variables
40
+ uint stc_constraint_height, float expected_coding_loss, // other input parameters
41
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output variables
42
+ // algorithm for embedding into 2 layers, both payload- and distortion-limited case
43
+ float stc_ml2_embed(uint cover_length, float* costs, int* stego_values, uint message_length, u8* message, float target_distortion, // input variables
44
+ uint stc_constraint_height, float expected_coding_loss, // other input parameters
45
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output and optional variables
46
+ // algorithm for embedding into 3 layers, both payload- and distortion-limited case
47
+ float stc_ml3_embed(uint cover_length, float* costs, int* stego_values, uint message_length, u8* message, float target_distortion, // input variables
48
+ uint stc_constraint_height, float expected_coding_loss, // other input parameters
49
+ int* stego, uint* num_msg_bits, uint &max_trials, float* coding_loss = 0); // output and optional variables
50
+
51
+ // EXTRACTION ALGORITHMS **********************************************************************************************************
52
+
53
+ /** Extraction algorithm for 2 layered construction. Can be used with: stc_pm1_pls_embed, stc_pm1_dls_embed, stc_ml2_embed
54
+ @param stego_length - ...
55
+ @param stego - ...
56
+ @param msg_bits - ...
57
+ @param stc_constraint_height - ...
58
+ @param message - ...
59
+ */
60
+ void stc_ml_extract(uint stego_length, int* stego, uint num_of_layers, uint* num_msg_bits, // input variables
61
+ uint stc_constraint_height, // other input parameters
62
+ u8* message); // output variables
63
+
64
+ #endif // STC_ML_H