|
|
|
|
@ -1,51 +1,53 @@
|
|
|
|
|
/*
|
|
|
|
|
* Based on public domain code available at: http://cr.yp.to/snuffle.html
|
|
|
|
|
* |
|
|
|
|
* This therefore is public domain. |
|
|
|
|
* Modifications and C-native SSE macro based SSE implementation by |
|
|
|
|
* Adam Ierymenko <adam.ierymenko@zerotier.com>. |
|
|
|
|
* |
|
|
|
|
* Since the original was public domain, this is too. |
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
#include "Salsa20.hpp" |
|
|
|
|
#include "Constants.hpp" |
|
|
|
|
#include "Salsa20.hpp" |
|
|
|
|
|
|
|
|
|
// Salsa20 core word operations.
// ROTATE: rotate v left by c bits; c must be in 1..31 (a shift count of 0
// or the full word width would be undefined behavior in C).
#define ROTATE(v,c) (((v) << (c)) | ((v) >> (32 - (c))))
// XOR: bitwise exclusive-or of two 32-bit words.
#define XOR(v,w) ((v) ^ (w))
// PLUS: addition modulo 2^32 (the cast makes the wrap-around explicit).
#define PLUS(v,w) ((uint32_t)((v) + (w)))
|
|
|
|
|
|
|
|
|
// Set up load/store macros with appropriate endianness (we don't use these in SSE mode)
|
|
|
|
|
#ifndef ZT_SALSA20_SSE |
|
|
|
|
|
|
|
|
|
#if __BYTE_ORDER == __LITTLE_ENDIAN |
|
|
|
|
|
|
|
|
|
/* We have a slower version of these macros for CPU/compiler combos that
|
|
|
|
|
* do not allow unaligned access to a uint32_t. Another solution would be |
|
|
|
|
* to methodically require alignment across the code, but this is quicker |
|
|
|
|
* for now. The culprit appears to be some Android-based ARM devices. */ |
|
|
|
|
#if 1 |
|
|
|
|
// Portable little-endian load: assemble a uint32_t from four bytes without
// performing any unaligned 32-bit access. Safe on CPU/compiler combos that
// fault or mishandle unaligned uint32_t loads (per the note above, some
// Android ARM devices).
#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )

// Portable little-endian store: write v into c one byte at a time, low byte
// first, avoiding unaligned stores and strict-aliasing pointer casts.
static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v)
{
	c[0] = (uint8_t)v;
	c[1] = (uint8_t)(v >> 8);
	c[2] = (uint8_t)(v >> 16);
	c[3] = (uint8_t)(v >> 24);
}
|
|
|
|
#else |
|
|
|
|
// Slow version that does not use type punning
|
|
|
|
|
//#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
|
|
|
|
|
//static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); }
|
|
|
|
|
|
|
|
|
|
// Fast version that just does 32-bit load/store
|
|
|
|
|
// Fast little-endian load/store: a single 32-bit memory access.
// NOTE(review): these cast byte pointers to uint32_t*, which is formally a
// strict-aliasing/alignment violation — acceptable on x86-class hardware,
// which is presumably why the byte-wise versions above are the default
// (#if 1) and this branch is currently dead. Confirm before enabling.
#define U8TO32_LITTLE(p) (*((const uint32_t *)((const void *)(p))))
#define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = (v)
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#else // big endian
|
|
|
|
|
#else // __BYTE_ORDER == __BIG_ENDIAN (we don't support anything else... does MIDDLE_ENDIAN even still exist?)
|
|
|
|
|
|
|
|
|
|
#ifdef __GNUC__ |
|
|
|
|
|
|
|
|
|
// Use GNUC builtin bswap macros on big-endian machines if available
|
|
|
|
|
// Big-endian targets with GCC/Clang: do one 32-bit access plus a builtin
// byte swap, so the externally visible byte order is still little-endian.
#define U8TO32_LITTLE(p) __builtin_bswap32(*((const uint32_t *)((const void *)(p))))
#define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = __builtin_bswap32((v))
|
|
|
|
#else // no bswap stuff... need to do it manually?
|
|
|
|
|
error need be; |
|
|
|
|
|
|
|
|
|
#else // no __GNUC__
|
|
|
|
|
|
|
|
|
|
// Otherwise do it the slow, manual way on BE machines
|
|
|
|
|
// Manual byte-wise little-endian load/store for big-endian compilers that
// lack __builtin_bswap32: no 32-bit access, so endianness and alignment of
// the target buffer do not matter.
#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v)
{
	c[0] = (uint8_t)v;
	c[1] = (uint8_t)(v >> 8);
	c[2] = (uint8_t)(v >> 16);
	c[3] = (uint8_t)(v >> 24);
}
|
|
|
|
|
|
|
|
|
#endif // __GNUC__ or not
|
|
|
|
|
|
|
|
|
|
#endif // little/big endian
|
|
|
|
|
#endif // __BYTE_ORDER little or big?
|
|
|
|
|
|
|
|
|
|
#endif // !ZT_SALSA20_SSE
|
|
|
|
|
|
|
|
|
|
// Statically compute and define SSE constants
|
|
|
|
|
#ifdef ZT_SALSA20_SSE |
|
|
|
|
class _s20sseconsts |
|
|
|
|
{ |
|
|
|
|
|