Got TensorFlow R0.9 working on TX1 with Bazel 0.2.1, CUDA 8.0, cuDNN 5.1, L4T 24.2, and a fresh JetPack 2.3 install. I've tested it with basic MLP, Conv, and LSTM nets using BN, Sigmoid, ReLU, etc., with no errors yet. I removed sparse_matmul_op, but otherwise I believe compilation should be fully operational. Many of these steps come directly from MaxCuda's excellent blog, so huge thanks to them for providing it.
I plan to continue hammering on R0.10/R0.11 (the gRPC binary is preventing a Bazel 0.3.0 build right now), but until then I figured I'd post the R0.9 formula, as below:
First get java
sudo add-apt-repository ppa:webupd8team/java
sudo apt-get update
sudo apt-get install oracle-java8-installer
Install some other deps
sudo apt-get install git zip unzip autoconf automake libtool curl zlib1g-dev maven swig
Need to build protobuf 3.0.0-beta-2 jar yourself
git clone https://github.com/google/protobuf.git
cd protobuf
# autogen.sh downloads a broken gmock.zip at d5fb408d, so run autogen.sh from master first and then check out d5fb408d
git checkout master
./autogen.sh
git checkout d5fb408d
./configure --prefix=/usr
make -j 4
sudo make install
cd java
mvn package
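Get bazel. We want version 0.2.1 because, unlike 0.3.0, it doesn't require the gRPC binary, which I can't build yet (maybe soon!). Clone it in the directory that contains the protobuf checkout (not inside protobuf/java), since the cp command below refers to ../protobuf.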
git clone https://github.com/bazelbuild/bazel.git
cd bazel
git checkout 0.2.1
cp /usr/bin/protoc third_party/protobuf/protoc-linux-arm32.exe
cp ../protobuf/java/target/protobuf-java-3.0.0-beta-2.jar third_party/protobuf/protobuf-java-3.0.0-beta-1.jar
Need to edit a bazel file to recognize aarch64 as ARM
--- a/src/main/java/com/google/devtools/build/lib/util/CPU.java
+++ b/src/main/java/com/google/devtools/build/lib/util/CPU.java
@@ -25,7 +25,7 @@ import java.util.Set;
public enum CPU {
X86_32("x86_32", ImmutableSet.of("i386", "i486", "i586", "i686", "i786", "x86")),
X86_64("x86_64", ImmutableSet.of("amd64", "x86_64", "x64")),
- ARM("arm", ImmutableSet.of("arm", "armv7l")),
+ ARM("arm", ImmutableSet.of("arm", "armv7l", "aarch64")),
UNKNOWN("unknown", ImmutableSet.<String>of());
Now compile
./compile.sh
And install
sudo cp output/bazel /usr/local/bin
Get tensorflow R0.9. Versions higher than R0.9 require Bazel 0.3.0, which I haven't figured out how to build yet due to gRPC issues.
git clone -b r0.9 https://github.com/tensorflow/tensorflow.git
Build once, from inside the tensorflow checkout. It will fail, but now you have the bazel .cache dir where you can place updated config.guess & config.sub files that will figure out what architecture you're running.
./configure
bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
cd ~
wget -O config.guess 'http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD'
wget -O config.sub 'http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD'
# below are commands I ran, yours will vary depending on .cache details. `find` is your friend
cp config.guess ./.cache/bazel/_bazel_socialh/742c01ff0765b098544431b60b1eed9f/external/farmhash_archive/farmhash-34c13ddfab0e35422f4c3979f360635a8c050260/config.guess
cp config.sub ./.cache/bazel/_bazel_socialh/742c01ff0765b098544431b60b1eed9f/external/farmhash_archive/farmhash-34c13ddfab0e35422f4c3979f360635a8c050260/config.sub
sparse_matmul_op had a couple of errors, so I took the cowardly route and removed it from the build.
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -985,7 +985,7 @@ tf_kernel_libraries(
"reduction_ops",
"segment_reduction_ops",
"sequence_ops",
- "sparse_matmul_op",
+ #DC "sparse_matmul_op",
],
deps = [
":bounds_check",
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1110,7 +1110,7 @@ medium_kernel_test_list = glob([
"kernel_tests/seq2seq_test.py",
"kernel_tests/slice_op_test.py",
"kernel_tests/sparse_ops_test.py",
- "kernel_tests/sparse_matmul_op_test.py",
+ #DC "kernel_tests/sparse_matmul_op_test.py",
"kernel_tests/sparse_tensor_dense_matmul_op_test.py",
])
TX1 can't do fancy constructors in cwise_op_gpu_select.cu.cc, so replace the brace initializers with element-by-element assignment.
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -43,8 +43,14 @@ struct BatchSelectFunctor<GPUDevice, T> {
const int all_but_batch = then_flat_outer_dims.dimension(1);
#if !defined(EIGEN_HAS_INDEX_LIST)
- Eigen::array<int, 2> broadcast_dims{{ 1, all_but_batch }};
- Eigen::Tensor<int, 2>::Dimensions reshape_dims{{ batch, 1 }};
+ //DC Eigen::array<int, 2> broadcast_dims{{ 1, all_but_batch }};
+ Eigen::array<int, 2> broadcast_dims;
+ broadcast_dims[0] = 1;
+ broadcast_dims[1] = all_but_batch;
+ //DC Eigen::Tensor<int, 2>::Dimensions reshape_dims{{ batch, 1 }};
+ Eigen::Tensor<int, 2>::Dimensions reshape_dims;
+ reshape_dims[0] = batch;
+ reshape_dims[1] = 1;
#else
Eigen::IndexList<Eigen::type2index<1>, int> broadcast_dims;
broadcast_dims.set(1, all_but_batch);
Same in sparse_tensor_dense_matmul_op_gpu.cu.cc
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
@@ -104,9 +104,17 @@ struct SparseTensorDenseMatMulFunctor<GPUDevice, T, ADJ_A, ADJ_B> {
int n = (ADJ_B) ? b.dimension(0) : b.dimension(1);
#if !defined(EIGEN_HAS_INDEX_LIST)
- Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz{{ 1, nnz }};
- Eigen::array<int, 2> n_by_1{{ n, 1 }};
- Eigen::array<int, 1> reduce_on_rows{{ 0 }};
+ //DC Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz{{ 1, nnz }};
+ Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz;
+ matrix_1_by_nnz[0] = 1;
+ matrix_1_by_nnz[1] = nnz;
+ //DC Eigen::array<int, 2> n_by_1{{ n, 1 }};
+ Eigen::array<int, 2> n_by_1;
+ n_by_1[0] = n;
+ n_by_1[1] = 1;
+ //DC Eigen::array<int, 1> reduce_on_rows{{ 0 }};
+ Eigen::array<int, 1> reduce_on_rows;
+ reduce_on_rows[0] = 0;
#else
Eigen::IndexList<Eigen::type2index<1>, int> matrix_1_by_nnz;
matrix_1_by_nnz.set(1, nnz);
Running with CUDA 8.0 requires new macros for FP16. Many thanks to Kashif/Mrry for pointing out the fix!
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -25,6 +25,12 @@ limitations under the License.
#define EIGEN_HAS_CUDA_FP16
#endif
+#if CUDA_VERSION >= 8000
+#define SE_CUDA_DATA_HALF CUDA_R_16F
+#else
+#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF
+#endif
+
#include "tensorflow/stream_executor/cuda/cuda_blas.h"
#include <dlfcn.h>
@@ -1680,10 +1686,10 @@ bool CUDABlas::DoBlasGemm(
return DoBlasInternal(
dynload::cublasSgemmEx, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
- CUDAMemory(a), CUBLAS_DATA_HALF, lda,
- CUDAMemory(b), CUBLAS_DATA_HALF, ldb,
+ CUDAMemory(a), SE_CUDA_DATA_HALF, lda,
+ CUDAMemory(b), SE_CUDA_DATA_HALF, ldb,
&beta,
- CUDAMemoryMutable(c), CUBLAS_DATA_HALF, ldc);
+ CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
#else
LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
<< "(need at least CUDA 7.5)";
And lastly, ARM has no NUMA nodes, so this needs to be added or you will get an immediate crash on starting tf.Session().
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -888,6 +888,9 @@ CudaContext* CUDAExecutor::cuda_context() { return context_; }
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
+ // DC - make this clever later. ARM has no NUMA node, just return 0
+ LOG(INFO) << "ARM has no NUMA node, hardcoding to return zero";
+ return 0;
#if defined(__APPLE__)
LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
return 0;
After these changes, build and install! Hope this is useful to some folks.