How to bind Theano to BLAS?












0















Background:



I'm trying to teach myself how to program something that can "read" hand-written text, just as a personal challenge. I'm familiar with Python and C++. I'm a bit rusty and out of my element, so progress is slow. Right now I'm trying to bind Theano to BLAS. I'm also new to this site and this area of Python in general so I apologize if I'm not using proper etiquette/formatting or am not providing the right information...I'm learning!



I'm running Python 2.7 on Windows 10.



My issue:



When I fire up a command prompt via Anaconda I input this:



python check_blas.py


and I get



Total execution time: 34.96s on CPU (without direct Theano binding to blas but with numpy/scipy binding to blas).


So I'm trying to get Theano to bind to blas, but I'm not sure what else to try to accomplish this.



Relevant code:



I have a file called check_blas.py, located at C:\Users\[username], which contains (apologies if this isn't useful):



#!/usr/bin/env python

# Print info to check which version of BLAS we link with.
# test the speed of the blas gemm fct:
# C=a*C+dot(A,B)*b
# A,B,C matrix
# a,b scalar
from __future__ import absolute_import, print_function, division

import os
import sys
import time
from optparse import OptionParser

import numpy as np
import theano
import theano.tensor as T


def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
            iters=10, order='C'):
    """
    Time a Theano function that updates ``c`` to ``0.4 * c + 0.8 * dot(a, b)``.

    :param execute: If True, execute a Theano function that should call gemm.
    :param verbose: If True, will print some Theano flags and env variables.
    :param M,N,K: The M,N,K size used by gemm.
    :param iters: The number of calls to gemm to do.
    :param order: Memory layout passed to ``np.ones`` for all three matrices.

    :return: a tuple (execution time,
             str that represents the implementation used).
             The time is -1 when ``execute`` is False.
    """
    if verbose:
        print('Some Theano flags:')
        print(' blas.ldflags=', theano.config.blas.ldflags)
        print(' compiledir=', theano.config.compiledir)
        print(' floatX=', theano.config.floatX)
        print(' device=', theano.config.device)
        print('Some OS information:')
        print(' sys.platform=', sys.platform)
        print(' sys.version=', sys.version)
        print(' sys.prefix=', sys.prefix)
        print('Some environment variables:')
        print(' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS'))
        print(' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS'))
        print(' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS'))
        print()
        print('Numpy config: (used when the Theano flag'
              ' "blas.ldflags" is empty)')
        np.show_config()
        print('Numpy dot module:', np.dot.__module__)
        print('Numpy location:', np.__file__)
        print('Numpy version:', np.__version__)

    a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
                              order=order))
    b = theano.shared(np.ones((N, K), dtype=theano.config.floatX,
                              order=order))
    c = theano.shared(np.ones((M, K), dtype=theano.config.floatX,
                              order=order))
    # No explicit inputs: each call only applies the update to ``c``.
    f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])

    # Classify the implementation from the op class names in the compiled
    # graph.
    op_names = [node.op.__class__.__name__
                for node in f.maker.fgraph.toposort()]
    if 'Gemm' in op_names:
        # One flag per Gemm node: whether its thunk exposes ``cthunk``.
        c_impl = [hasattr(thunk, 'cthunk')
                  for node, thunk in zip(f.fn.nodes, f.fn.thunks)
                  if node.op.__class__.__name__ == "Gemm"]
        assert len(c_impl) == 1
        if c_impl[0]:
            impl = 'CPU (with direct Theano binding to blas)'
        else:
            impl = ('CPU (without direct Theano binding to blas '
                    'but with numpy/scipy binding to blas)')
    elif 'GpuGemm' in op_names:
        impl = 'GPU'
    else:
        impl = 'ERROR, unable to tell if Theano used the cpu or the gpu:\n'
        impl += str(f.maker.fgraph.toposort())

    # These defaults yield an elapsed time of -1 when the timed loop is
    # skipped.
    t0 = 0
    t1 = -1

    f()  # Ignore first function call to get representative time.
    if execute:
        sync = (hasattr(theano, "gpuarray") and
                isinstance(c, theano.gpuarray.GpuArraySharedVariable))
        if sync:
            # Make sure we don't include the time from the first call
            c.get_value(borrow=True, return_internal_type=True).sync()
        t0 = time.time()
        for i in range(iters):
            f()
        if sync:
            c.get_value(borrow=True, return_internal_type=True).sync()
        t1 = time.time()
    return t1 - t0, impl


def jobman_job(state, channel):
    """
    Run :func:`execute` with its default arguments and report completion.

    :param state: Not used by this function.
    :param channel: Object whose ``COMPLETE`` attribute is returned.

    :return: ``channel.COMPLETE``
    """
    execute()
    return channel.COMPLETE


def test():
    """Run :func:`execute` with its default arguments and return its result."""
    return execute()


# Command-line interface of the benchmark script.
parser = OptionParser(
    usage='%prog <options>\nCompute time needed to perform BLAS gemm '
          'computations between matrices of size (M, N) and (N, K).')

parser.add_option('-q', '--quiet', action='store_true', dest='quiet',
                  default=False,
                  help="If true, do not print the comparison table and config "
                       "options")
parser.add_option('--print_only', action='store_true', dest='print_only',
                  default=False,
                  help="If true, do not perform gemm computations")
# A default of 0 for M, N and K lets the caller detect "not given".
parser.add_option('-M', '--M', action='store', dest='M',
                  default=0, type="int",
                  help="The M size to gemm")
parser.add_option('-N', '--N', action='store', dest='N',
                  default=0, type="int",
                  help="The N size to gemm")
parser.add_option('-K', '--K', action='store', dest='K',
                  default=0, type="int",
                  help="The K size to gemm")
parser.add_option('--iter', action='store', dest='iter',
                  default=10, type="int",
                  help="The number of calls to gemm")
parser.add_option('--order', action='store', dest='order',
                  default="C",
                  help="The numpy memory layout parameter used when creating"
                       " the numpy.ndarray objects. It accepts 'C' for C memory"
                       " order and 'F' for Fortran order (for all matrices).")
parser.add_option('-B', '--B', action='store', dest='B',
                  default=5000, type="int",
                  help="The M, N, and K for big gemm")


if __name__ == "__main__":
    # Script entry point: parse the options, optionally print the reference
    # table, run the benchmark and report the timing.
    options, arguments = parser.parse_args(sys.argv)

    # NOTE(review): optparse normally handles -h/--help itself, so this
    # branch looks unreachable -- confirm before removing.
    if hasattr(options, "help"):
        print(options.help)
        sys.exit(0)

    if not options.quiet:
        print("""
Some results that you can compare against. They were 10 executions
of gemm in float64 with matrices of shape 2000x2000 (M=N=K=2000).
All memory layout was in C order.

CPU tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB),
Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB),
Xeon X5560(2.8Ghz, 12M L2 cache, hyper-threads?)
Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled),
Core i7 950(3.07GHz, hyper-threads enabled)
Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)


Libraries tested:
* numpy with ATLAS from distribution (FC9) package (1 thread)
* manually compiled numpy and ATLAS with 2 threads
* goto 1.26 with 1, 2, 4 and 8 threads
* goto2 1.13 compiled with multiple threads enabled

Xeon Xeon Xeon Core2 i7 i7 Xeon Xeon
lib/nb threads E5345 E5430 E5450 E8500 930 950 X5560 X5550

numpy 1.3.0 blas 775.92s
numpy_FC9_atlas/1 39.2s 35.0s 30.7s 29.6s 21.5s 19.60s
goto/1 18.7s 16.1s 14.2s 13.7s 16.1s 14.67s
numpy_MAN_atlas/2 12.0s 11.6s 10.2s 9.2s 9.0s
goto/2 9.5s 8.1s 7.1s 7.3s 8.1s 7.4s
goto/4 4.9s 4.4s 3.7s - 4.1s 3.8s
goto/8 2.7s 2.4s 2.0s - 4.1s 3.8s
openblas/1 14.04s
openblas/2 7.16s
openblas/4 3.71s
openblas/8 3.70s
mkl 11.0.083/1 7.97s
mkl 10.2.2.025/1 13.7s
mkl 10.2.2.025/2 7.6s
mkl 10.2.2.025/4 4.0s
mkl 10.2.2.025/8 2.0s
goto2 1.13/1 14.37s
goto2 1.13/2 7.26s
goto2 1.13/4 3.70s
goto2 1.13/8 1.94s
goto2 1.13/16 3.16s

Test time in float32. There were 10 executions of gemm in
float32 with matrices of shape 5000x5000 (M=N=K=5000)
All memory layout was in C order.


cuda version 8.0 7.5 7.0
gpu
M40 0.45s 0.47s
k80 0.92s 0.96s
K6000/NOECC 0.71s 0.69s
P6000/NOECC 0.25s

Titan X (Pascal) 0.28s
GTX Titan X 0.45s 0.45s 0.47s
GTX Titan Black 0.66s 0.64s 0.64s
GTX 1080 0.35s
GTX 980 Ti 0.41s
GTX 970 0.66s
GTX 680 1.57s
GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX 580 2.42s
GTX 480 2.87s
TX1 7.6s (float32 storage and computation)
GT 610 33.5s
""")

    # A size of 0 means the option was not given: fall back to the -B value.
    M = options.B if options.M == 0 else options.M
    N = options.B if options.N == 0 else options.N
    K = options.B if options.K == 0 else options.K

    t, impl = execute(not options.print_only, not options.quiet,
                      M=M, N=N, K=K, iters=options.iter,
                      order=options.order)

    if options.print_only:
        pass
    elif options.quiet:
        # Quiet mode prints only the raw timing.
        print(t)
    else:
        print()
        print("We executed", options.iter, end=' ')
        print("calls to gemm with a and b matrices of shapes", end=' ')
        print("(%d, %d) and (%d, %d)." % (M, N, N, K))

        print()
        print('Total execution time: %.2fs on %s.' % (t, impl))
        print()
        print('Try to run this script a few times. Experience shows that'
              ' the first time is not as fast as followings calls. The'
              ' difference is not big, but consistent.')


I also have a file called .theanorc in the same directory, containing:



[global]
floatx = float32

[blas]
ldflags = -LC:\Anaconda3\Library\bin -lmkl_r


What I've tried



I've tried moving .theanorc into its own folder at C:\Users\[username]\.theanorc, but that didn't fix the issue (everything I've read so far has been pretty ambiguous about whether or not .theanorc should go in its own folder).










share|improve this question



























    0















    Background:



    I'm trying to teach myself how to program something that can "read" hand-written text, just as a personal challenge. I'm familiar with Python and C++. I'm a bit rusty and out of my element, so progress is slow. Right now I'm trying to bind Theano to BLAS. I'm also new to this site and this area of Python in general so I apologize if I'm not using proper etiquette/formatting or am not providing the right information...I'm learning!



    I'm running Python 2.7 on Windows 10.



    My issue:



    When I fire up a command prompt via Anaconda I input this:



    python check_blas.py


    and I get



    Total execution time: 34.96s on CPU (without direct Theano binding to blas but with numpy/scipy binding to blas).


    So I'm trying to get Theano to bind to blas, but I'm not sure what else to try to accomplish this.



    Relevant code:



    I have a file called check_blas.py, located at C:\Users\[username], which contains (apologies if this isn't useful):



    #!/usr/bin/env python

    # Print info to check which version of BLAS we link with.
    # test the speed of the blas gemm fct:
    # C=a*C+dot(A,B)*b
    # A,B,C matrix
    # a,b scalar
    from __future__ import absolute_import, print_function, division

    import os
    import sys
    import time
    from optparse import OptionParser

    import numpy as np
    import theano
    import theano.tensor as T


    def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
                iters=10, order='C'):
        """
        Time a Theano function that updates ``c`` to ``0.4 * c + 0.8 * dot(a, b)``.

        :param execute: If True, execute a Theano function that should call gemm.
        :param verbose: If True, will print some Theano flags and env variables.
        :param M,N,K: The M,N,K size used by gemm.
        :param iters: The number of calls to gemm to do.
        :param order: Memory layout passed to ``np.ones`` for all three matrices.

        :return: a tuple (execution time,
                 str that represents the implementation used).
                 The time is -1 when ``execute`` is False.
        """
        if verbose:
            print('Some Theano flags:')
            print(' blas.ldflags=', theano.config.blas.ldflags)
            print(' compiledir=', theano.config.compiledir)
            print(' floatX=', theano.config.floatX)
            print(' device=', theano.config.device)
            print('Some OS information:')
            print(' sys.platform=', sys.platform)
            print(' sys.version=', sys.version)
            print(' sys.prefix=', sys.prefix)
            print('Some environment variables:')
            print(' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS'))
            print(' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS'))
            print(' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS'))
            print()
            print('Numpy config: (used when the Theano flag'
                  ' "blas.ldflags" is empty)')
            np.show_config()
            print('Numpy dot module:', np.dot.__module__)
            print('Numpy location:', np.__file__)
            print('Numpy version:', np.__version__)

        a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
                                  order=order))
        b = theano.shared(np.ones((N, K), dtype=theano.config.floatX,
                                  order=order))
        c = theano.shared(np.ones((M, K), dtype=theano.config.floatX,
                                  order=order))
        # No explicit inputs: each call only applies the update to ``c``.
        f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])

        # Classify the implementation from the op class names in the compiled
        # graph.
        op_names = [node.op.__class__.__name__
                    for node in f.maker.fgraph.toposort()]
        if 'Gemm' in op_names:
            # One flag per Gemm node: whether its thunk exposes ``cthunk``.
            c_impl = [hasattr(thunk, 'cthunk')
                      for node, thunk in zip(f.fn.nodes, f.fn.thunks)
                      if node.op.__class__.__name__ == "Gemm"]
            assert len(c_impl) == 1
            if c_impl[0]:
                impl = 'CPU (with direct Theano binding to blas)'
            else:
                impl = ('CPU (without direct Theano binding to blas '
                        'but with numpy/scipy binding to blas)')
        elif 'GpuGemm' in op_names:
            impl = 'GPU'
        else:
            impl = 'ERROR, unable to tell if Theano used the cpu or the gpu:\n'
            impl += str(f.maker.fgraph.toposort())

        # These defaults yield an elapsed time of -1 when the timed loop is
        # skipped.
        t0 = 0
        t1 = -1

        f()  # Ignore first function call to get representative time.
        if execute:
            sync = (hasattr(theano, "gpuarray") and
                    isinstance(c, theano.gpuarray.GpuArraySharedVariable))
            if sync:
                # Make sure we don't include the time from the first call
                c.get_value(borrow=True, return_internal_type=True).sync()
            t0 = time.time()
            for i in range(iters):
                f()
            if sync:
                c.get_value(borrow=True, return_internal_type=True).sync()
            t1 = time.time()
        return t1 - t0, impl


    def jobman_job(state, channel):
        """
        Run :func:`execute` with its default arguments and report completion.

        :param state: Not used by this function.
        :param channel: Object whose ``COMPLETE`` attribute is returned.

        :return: ``channel.COMPLETE``
        """
        execute()
        return channel.COMPLETE


    def test():
        """Run :func:`execute` with its default arguments and return its result."""
        return execute()


    # Command-line interface of the benchmark script.
    parser = OptionParser(
        usage='%prog <options>\nCompute time needed to perform BLAS gemm '
              'computations between matrices of size (M, N) and (N, K).')

    parser.add_option('-q', '--quiet', action='store_true', dest='quiet',
                      default=False,
                      help="If true, do not print the comparison table and config "
                           "options")
    parser.add_option('--print_only', action='store_true', dest='print_only',
                      default=False,
                      help="If true, do not perform gemm computations")
    # A default of 0 for M, N and K lets the caller detect "not given".
    parser.add_option('-M', '--M', action='store', dest='M',
                      default=0, type="int",
                      help="The M size to gemm")
    parser.add_option('-N', '--N', action='store', dest='N',
                      default=0, type="int",
                      help="The N size to gemm")
    parser.add_option('-K', '--K', action='store', dest='K',
                      default=0, type="int",
                      help="The K size to gemm")
    parser.add_option('--iter', action='store', dest='iter',
                      default=10, type="int",
                      help="The number of calls to gemm")
    parser.add_option('--order', action='store', dest='order',
                      default="C",
                      help="The numpy memory layout parameter used when creating"
                           " the numpy.ndarray objects. It accepts 'C' for C memory"
                           " order and 'F' for Fortran order (for all matrices).")
    parser.add_option('-B', '--B', action='store', dest='B',
                      default=5000, type="int",
                      help="The M, N, and K for big gemm")


    if __name__ == "__main__":
        # Script entry point: parse the options, optionally print the reference
        # table, run the benchmark and report the timing.
        options, arguments = parser.parse_args(sys.argv)

        # NOTE(review): optparse normally handles -h/--help itself, so this
        # branch looks unreachable -- confirm before removing.
        if hasattr(options, "help"):
            print(options.help)
            sys.exit(0)

        if not options.quiet:
            print("""
    Some results that you can compare against. They were 10 executions
    of gemm in float64 with matrices of shape 2000x2000 (M=N=K=2000).
    All memory layout was in C order.

    CPU tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB),
    Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
    Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB),
    Xeon X5560(2.8Ghz, 12M L2 cache, hyper-threads?)
    Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled),
    Core i7 950(3.07GHz, hyper-threads enabled)
    Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)


    Libraries tested:
    * numpy with ATLAS from distribution (FC9) package (1 thread)
    * manually compiled numpy and ATLAS with 2 threads
    * goto 1.26 with 1, 2, 4 and 8 threads
    * goto2 1.13 compiled with multiple threads enabled

    Xeon Xeon Xeon Core2 i7 i7 Xeon Xeon
    lib/nb threads E5345 E5430 E5450 E8500 930 950 X5560 X5550

    numpy 1.3.0 blas 775.92s
    numpy_FC9_atlas/1 39.2s 35.0s 30.7s 29.6s 21.5s 19.60s
    goto/1 18.7s 16.1s 14.2s 13.7s 16.1s 14.67s
    numpy_MAN_atlas/2 12.0s 11.6s 10.2s 9.2s 9.0s
    goto/2 9.5s 8.1s 7.1s 7.3s 8.1s 7.4s
    goto/4 4.9s 4.4s 3.7s - 4.1s 3.8s
    goto/8 2.7s 2.4s 2.0s - 4.1s 3.8s
    openblas/1 14.04s
    openblas/2 7.16s
    openblas/4 3.71s
    openblas/8 3.70s
    mkl 11.0.083/1 7.97s
    mkl 10.2.2.025/1 13.7s
    mkl 10.2.2.025/2 7.6s
    mkl 10.2.2.025/4 4.0s
    mkl 10.2.2.025/8 2.0s
    goto2 1.13/1 14.37s
    goto2 1.13/2 7.26s
    goto2 1.13/4 3.70s
    goto2 1.13/8 1.94s
    goto2 1.13/16 3.16s

    Test time in float32. There were 10 executions of gemm in
    float32 with matrices of shape 5000x5000 (M=N=K=5000)
    All memory layout was in C order.


    cuda version 8.0 7.5 7.0
    gpu
    M40 0.45s 0.47s
    k80 0.92s 0.96s
    K6000/NOECC 0.71s 0.69s
    P6000/NOECC 0.25s

    Titan X (Pascal) 0.28s
    GTX Titan X 0.45s 0.45s 0.47s
    GTX Titan Black 0.66s 0.64s 0.64s
    GTX 1080 0.35s
    GTX 980 Ti 0.41s
    GTX 970 0.66s
    GTX 680 1.57s
    GTX 750 Ti 2.01s 2.01s
    GTX 750 2.46s 2.37s
    GTX 660 2.32s 2.32s
    GTX 580 2.42s
    GTX 480 2.87s
    TX1 7.6s (float32 storage and computation)
    GT 610 33.5s
    """)

        # A size of 0 means the option was not given: fall back to the -B value.
        M = options.B if options.M == 0 else options.M
        N = options.B if options.N == 0 else options.N
        K = options.B if options.K == 0 else options.K

        t, impl = execute(not options.print_only, not options.quiet,
                          M=M, N=N, K=K, iters=options.iter,
                          order=options.order)

        if options.print_only:
            pass
        elif options.quiet:
            # Quiet mode prints only the raw timing.
            print(t)
        else:
            print()
            print("We executed", options.iter, end=' ')
            print("calls to gemm with a and b matrices of shapes", end=' ')
            print("(%d, %d) and (%d, %d)." % (M, N, N, K))

            print()
            print('Total execution time: %.2fs on %s.' % (t, impl))
            print()
            print('Try to run this script a few times. Experience shows that'
                  ' the first time is not as fast as followings calls. The'
                  ' difference is not big, but consistent.')


    I also have a file called .theanorc in the same directory, containing:



    [global]
    floatx = float32

    [blas]
    ldflags = -LC:\Anaconda3\Library\bin -lmkl_r


    What I've tried



    I've tried moving .theanorc into its own folder at C:\Users\[username]\.theanorc, but that didn't fix the issue (everything I've read so far has been pretty ambiguous about whether or not .theanorc should go in its own folder).










    share|improve this question

























      0












      0








      0


      1






      Background:



      I'm trying to teach myself how to program something that can "read" hand-written text, just as a personal challenge. I'm familiar with Python and C++. I'm a bit rusty and out of my element, so progress is slow. Right now I'm trying to bind Theano to BLAS. I'm also new to this site and this area of Python in general so I apologize if I'm not using proper etiquette/formatting or am not providing the right information...I'm learning!



      I'm running Python 2.7 on Windows 10.



      My issue:



      When I fire up a command prompt via Anaconda I input this:



      python check_blas.py


      and I get



      Total execution time: 34.96s on CPU (without direct Theano binding to blas but with numpy/scipy binding to blas).


      So I'm trying to get Theano to bind to blas, but I'm not sure what else to try to accomplish this.



      Relevant code:



      I have a file called check_blas.py, located at C:\Users\[username], which contains (apologies if this isn't useful):



      #!/usr/bin/env python

      # Print info to check which version of BLAS we link with.
      # test the speed of the blas gemm fct:
      # C=a*C+dot(A,B)*b
      # A,B,C matrix
      # a,b scalar
      from __future__ import absolute_import, print_function, division

      import os
      import sys
      import time
      from optparse import OptionParser

      import numpy as np
      import theano
      import theano.tensor as T


      def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
                  iters=10, order='C'):
          """
          Time a Theano function that updates ``c`` to ``0.4 * c + 0.8 * dot(a, b)``.

          :param execute: If True, execute a Theano function that should call gemm.
          :param verbose: If True, will print some Theano flags and env variables.
          :param M,N,K: The M,N,K size used by gemm.
          :param iters: The number of calls to gemm to do.
          :param order: Memory layout passed to ``np.ones`` for all three matrices.

          :return: a tuple (execution time,
                   str that represents the implementation used).
                   The time is -1 when ``execute`` is False.
          """
          if verbose:
              print('Some Theano flags:')
              print(' blas.ldflags=', theano.config.blas.ldflags)
              print(' compiledir=', theano.config.compiledir)
              print(' floatX=', theano.config.floatX)
              print(' device=', theano.config.device)
              print('Some OS information:')
              print(' sys.platform=', sys.platform)
              print(' sys.version=', sys.version)
              print(' sys.prefix=', sys.prefix)
              print('Some environment variables:')
              print(' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS'))
              print(' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS'))
              print(' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS'))
              print()
              print('Numpy config: (used when the Theano flag'
                    ' "blas.ldflags" is empty)')
              np.show_config()
              print('Numpy dot module:', np.dot.__module__)
              print('Numpy location:', np.__file__)
              print('Numpy version:', np.__version__)

          a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
                                    order=order))
          b = theano.shared(np.ones((N, K), dtype=theano.config.floatX,
                                    order=order))
          c = theano.shared(np.ones((M, K), dtype=theano.config.floatX,
                                    order=order))
          # No explicit inputs: each call only applies the update to ``c``.
          f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])

          # Classify the implementation from the op class names in the compiled
          # graph.
          op_names = [node.op.__class__.__name__
                      for node in f.maker.fgraph.toposort()]
          if 'Gemm' in op_names:
              # One flag per Gemm node: whether its thunk exposes ``cthunk``.
              c_impl = [hasattr(thunk, 'cthunk')
                        for node, thunk in zip(f.fn.nodes, f.fn.thunks)
                        if node.op.__class__.__name__ == "Gemm"]
              assert len(c_impl) == 1
              if c_impl[0]:
                  impl = 'CPU (with direct Theano binding to blas)'
              else:
                  impl = ('CPU (without direct Theano binding to blas '
                          'but with numpy/scipy binding to blas)')
          elif 'GpuGemm' in op_names:
              impl = 'GPU'
          else:
              impl = 'ERROR, unable to tell if Theano used the cpu or the gpu:\n'
              impl += str(f.maker.fgraph.toposort())

          # These defaults yield an elapsed time of -1 when the timed loop is
          # skipped.
          t0 = 0
          t1 = -1

          f()  # Ignore first function call to get representative time.
          if execute:
              sync = (hasattr(theano, "gpuarray") and
                      isinstance(c, theano.gpuarray.GpuArraySharedVariable))
              if sync:
                  # Make sure we don't include the time from the first call
                  c.get_value(borrow=True, return_internal_type=True).sync()
              t0 = time.time()
              for i in range(iters):
                  f()
              if sync:
                  c.get_value(borrow=True, return_internal_type=True).sync()
              t1 = time.time()
          return t1 - t0, impl


      def jobman_job(state, channel):
          """
          Run :func:`execute` with its default arguments and report completion.

          :param state: Not used by this function.
          :param channel: Object whose ``COMPLETE`` attribute is returned.

          :return: ``channel.COMPLETE``
          """
          execute()
          return channel.COMPLETE


      def test():
          """Run :func:`execute` with its default arguments and return its result."""
          return execute()


      # Command-line interface of the benchmark script.
      parser = OptionParser(
          usage='%prog <options>\nCompute time needed to perform BLAS gemm '
                'computations between matrices of size (M, N) and (N, K).')

      parser.add_option('-q', '--quiet', action='store_true', dest='quiet',
                        default=False,
                        help="If true, do not print the comparison table and config "
                             "options")
      parser.add_option('--print_only', action='store_true', dest='print_only',
                        default=False,
                        help="If true, do not perform gemm computations")
      # A default of 0 for M, N and K lets the caller detect "not given".
      parser.add_option('-M', '--M', action='store', dest='M',
                        default=0, type="int",
                        help="The M size to gemm")
      parser.add_option('-N', '--N', action='store', dest='N',
                        default=0, type="int",
                        help="The N size to gemm")
      parser.add_option('-K', '--K', action='store', dest='K',
                        default=0, type="int",
                        help="The K size to gemm")
      parser.add_option('--iter', action='store', dest='iter',
                        default=10, type="int",
                        help="The number of calls to gemm")
      parser.add_option('--order', action='store', dest='order',
                        default="C",
                        help="The numpy memory layout parameter used when creating"
                             " the numpy.ndarray objects. It accepts 'C' for C memory"
                             " order and 'F' for Fortran order (for all matrices).")
      parser.add_option('-B', '--B', action='store', dest='B',
                        default=5000, type="int",
                        help="The M, N, and K for big gemm")


      if __name__ == "__main__":
          # Script entry point: parse the options, optionally print the reference
          # table, run the benchmark and report the timing.
          options, arguments = parser.parse_args(sys.argv)

          # NOTE(review): optparse normally handles -h/--help itself, so this
          # branch looks unreachable -- confirm before removing.
          if hasattr(options, "help"):
              print(options.help)
              sys.exit(0)

          if not options.quiet:
              print("""
      Some results that you can compare against. They were 10 executions
      of gemm in float64 with matrices of shape 2000x2000 (M=N=K=2000).
      All memory layout was in C order.

      CPU tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB),
      Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
      Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB),
      Xeon X5560(2.8Ghz, 12M L2 cache, hyper-threads?)
      Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled),
      Core i7 950(3.07GHz, hyper-threads enabled)
      Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)


      Libraries tested:
      * numpy with ATLAS from distribution (FC9) package (1 thread)
      * manually compiled numpy and ATLAS with 2 threads
      * goto 1.26 with 1, 2, 4 and 8 threads
      * goto2 1.13 compiled with multiple threads enabled

      Xeon Xeon Xeon Core2 i7 i7 Xeon Xeon
      lib/nb threads E5345 E5430 E5450 E8500 930 950 X5560 X5550

      numpy 1.3.0 blas 775.92s
      numpy_FC9_atlas/1 39.2s 35.0s 30.7s 29.6s 21.5s 19.60s
      goto/1 18.7s 16.1s 14.2s 13.7s 16.1s 14.67s
      numpy_MAN_atlas/2 12.0s 11.6s 10.2s 9.2s 9.0s
      goto/2 9.5s 8.1s 7.1s 7.3s 8.1s 7.4s
      goto/4 4.9s 4.4s 3.7s - 4.1s 3.8s
      goto/8 2.7s 2.4s 2.0s - 4.1s 3.8s
      openblas/1 14.04s
      openblas/2 7.16s
      openblas/4 3.71s
      openblas/8 3.70s
      mkl 11.0.083/1 7.97s
      mkl 10.2.2.025/1 13.7s
      mkl 10.2.2.025/2 7.6s
      mkl 10.2.2.025/4 4.0s
      mkl 10.2.2.025/8 2.0s
      goto2 1.13/1 14.37s
      goto2 1.13/2 7.26s
      goto2 1.13/4 3.70s
      goto2 1.13/8 1.94s
      goto2 1.13/16 3.16s

      Test time in float32. There were 10 executions of gemm in
      float32 with matrices of shape 5000x5000 (M=N=K=5000)
      All memory layout was in C order.


      cuda version 8.0 7.5 7.0
      gpu
      M40 0.45s 0.47s
      k80 0.92s 0.96s
      K6000/NOECC 0.71s 0.69s
      P6000/NOECC 0.25s

      Titan X (Pascal) 0.28s
      GTX Titan X 0.45s 0.45s 0.47s
      GTX Titan Black 0.66s 0.64s 0.64s
      GTX 1080 0.35s
      GTX 980 Ti 0.41s
      GTX 970 0.66s
      GTX 680 1.57s
      GTX 750 Ti 2.01s 2.01s
      GTX 750 2.46s 2.37s
      GTX 660 2.32s 2.32s
      GTX 580 2.42s
      GTX 480 2.87s
      TX1 7.6s (float32 storage and computation)
      GT 610 33.5s
      """)

          # A size of 0 means the option was not given: fall back to the -B value.
          M = options.B if options.M == 0 else options.M
          N = options.B if options.N == 0 else options.N
          K = options.B if options.K == 0 else options.K

          t, impl = execute(not options.print_only, not options.quiet,
                            M=M, N=N, K=K, iters=options.iter,
                            order=options.order)

          if options.print_only:
              pass
          elif options.quiet:
              # Quiet mode prints only the raw timing.
              print(t)
          else:
              print()
              print("We executed", options.iter, end=' ')
              print("calls to gemm with a and b matrices of shapes", end=' ')
              print("(%d, %d) and (%d, %d)." % (M, N, N, K))

              print()
              print('Total execution time: %.2fs on %s.' % (t, impl))
              print()
              print('Try to run this script a few times. Experience shows that'
                    ' the first time is not as fast as followings calls. The'
                    ' difference is not big, but consistent.')


      I also have a file called .theanorc in the same directory, containing:



      [global]
      floatx = float32

      [blas]
      ldflags = -LC:\Anaconda3\Library\bin -lmkl_r


      What I've tried



      I've tried moving .theanorc into its own folder at C:\Users\[username]\.theanorc, but that didn't fix the issue (everything I've read so far has been pretty ambiguous about whether or not .theanorc should go in its own folder).










      share|improve this question














      Background:



      I'm trying to teach myself how to program something that can "read" hand-written text, just as a personal challenge. I'm familiar with Python and C++. I'm a bit rusty and out of my element, so progress is slow. Right now I'm trying to bind Theano to BLAS. I'm also new to this site and this area of Python in general so I apologize if I'm not using proper etiquette/formatting or am not providing the right information...I'm learning!



      I'm running Python 2.7 on Windows 10.



      My issue:



      When I fire up a command prompt via Anaconda I input this:



      python check_blas.py


      and I get



      Total execution time: 34.96s on CPU (without direct Theano binding to blas but with numpy/scipy binding to blas).


      So I'm trying to get Theano to bind to blas, but I'm not sure what else to try to accomplish this.



      Relevant code:



      I have a file called check_blas.py, located at C:\Users\[username], which contains (apologies if this isn't useful):



      #!/usr/bin/env python

      # Print info to check which version of BLAS we link with.
      # test the speed of the blas gemm fct:
      # C=a*C+dot(A,B)*b
      # A,B,C matrix
      # a,b scalar
      from __future__ import absolute_import, print_function, division

      import os
      import sys
      import time
      from optparse import OptionParser

      import numpy as np
      import theano
      import theano.tensor as T


      def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
                  iters=10, order='C'):
          """
          Time a Theano function that updates ``c`` to ``0.4 * c + 0.8 * dot(a, b)``.

          :param execute: If True, execute a Theano function that should call gemm.
          :param verbose: If True, will print some Theano flags and env variables.
          :param M,N,K: The M,N,K size used by gemm.
          :param iters: The number of calls to gemm to do.
          :param order: Memory layout passed to ``np.ones`` for all three matrices.

          :return: a tuple (execution time,
                   str that represents the implementation used).
                   The time is -1 when ``execute`` is False.
          """
          if verbose:
              print('Some Theano flags:')
              print(' blas.ldflags=', theano.config.blas.ldflags)
              print(' compiledir=', theano.config.compiledir)
              print(' floatX=', theano.config.floatX)
              print(' device=', theano.config.device)
              print('Some OS information:')
              print(' sys.platform=', sys.platform)
              print(' sys.version=', sys.version)
              print(' sys.prefix=', sys.prefix)
              print('Some environment variables:')
              print(' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS'))
              print(' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS'))
              print(' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS'))
              print()
              print('Numpy config: (used when the Theano flag'
                    ' "blas.ldflags" is empty)')
              np.show_config()
              print('Numpy dot module:', np.dot.__module__)
              print('Numpy location:', np.__file__)
              print('Numpy version:', np.__version__)

          a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
                                    order=order))
          b = theano.shared(np.ones((N, K), dtype=theano.config.floatX,
                                    order=order))
          c = theano.shared(np.ones((M, K), dtype=theano.config.floatX,
                                    order=order))
          # No explicit inputs: each call only applies the update to ``c``.
          f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])

          # Classify the implementation from the op class names in the compiled
          # graph.
          op_names = [node.op.__class__.__name__
                      for node in f.maker.fgraph.toposort()]
          if 'Gemm' in op_names:
              # One flag per Gemm node: whether its thunk exposes ``cthunk``.
              c_impl = [hasattr(thunk, 'cthunk')
                        for node, thunk in zip(f.fn.nodes, f.fn.thunks)
                        if node.op.__class__.__name__ == "Gemm"]
              assert len(c_impl) == 1
              if c_impl[0]:
                  impl = 'CPU (with direct Theano binding to blas)'
              else:
                  impl = ('CPU (without direct Theano binding to blas '
                          'but with numpy/scipy binding to blas)')
          elif 'GpuGemm' in op_names:
              impl = 'GPU'
          else:
              impl = 'ERROR, unable to tell if Theano used the cpu or the gpu:\n'
              impl += str(f.maker.fgraph.toposort())

          # These defaults yield an elapsed time of -1 when the timed loop is
          # skipped.
          t0 = 0
          t1 = -1

          f()  # Ignore first function call to get representative time.
          if execute:
              sync = (hasattr(theano, "gpuarray") and
                      isinstance(c, theano.gpuarray.GpuArraySharedVariable))
              if sync:
                  # Make sure we don't include the time from the first call
                  c.get_value(borrow=True, return_internal_type=True).sync()
              t0 = time.time()
              for i in range(iters):
                  f()
              if sync:
                  c.get_value(borrow=True, return_internal_type=True).sync()
              t1 = time.time()
          return t1 - t0, impl


      def jobman_job(state, channel):
          """Run the gemm benchmark with default settings and report completion.

          :param state: Unused by this function; accepted so the signature
              matches what the caller passes (presumably a jobman job
              state -- confirm against the jobman API).
          :param channel: Object exposing a ``COMPLETE`` attribute, which is
              returned to signal that the job finished.

          :return: ``channel.COMPLETE``.
          """
          # The timing and implementation string returned by execute() are
          # discarded; only the printed output is kept.
          execute()
          return channel.COMPLETE


      def test():
          """Run the benchmark with its default arguments.

          :return: Whatever ``execute()`` returns: a tuple of
              (execution time, str describing the implementation used).
          """
          return execute()


      # Command-line interface of the gemm benchmark script.
      # The usage text is two lines: the synopsis, then a one-sentence
      # description; the "\n" between them must be a real newline escape.
      parser = OptionParser(
          usage='%prog <options>\nCompute time needed to perform BLAS gemm '
                'computations between matrices of size (M, N) and (N, K).')

      # Output control.
      parser.add_option('-q', '--quiet', action='store_true', dest='quiet',
                        default=False,
                        help="If true, do not print the comparison table and config "
                             "options")
      parser.add_option('--print_only', action='store_true', dest='print_only',
                        default=False,
                        help="If true, do not perform gemm computations")

      # Matrix sizes. A value of 0 (the default) means "not given"; the
      # __main__ section then falls back to the -B value for that dimension.
      parser.add_option('-M', '--M', action='store', dest='M',
                        default=0, type="int",
                        help="The M size to gemm")
      parser.add_option('-N', '--N', action='store', dest='N',
                        default=0, type="int",
                        help="The N size to gemm")
      parser.add_option('-K', '--K', action='store', dest='K',
                        default=0, type="int",
                        help="The K size to gemm")

      # Benchmark shape: repetition count and memory layout of the operands.
      parser.add_option('--iter', action='store', dest='iter',
                        default=10, type="int",
                        help="The number of calls to gemm")
      parser.add_option('--order', action='store', dest='order',
                        default="C",
                        help="The numpy memory layout parameter used when creating"
                             " the numpy.ndarray objects. It accepts 'C' for C memory"
                             " order and 'F' for Fortran order (for all matrices).")

      # Common size used for any of M, N, K that was left at 0.
      parser.add_option('-B', '--B', action='store', dest='B',
                        default=5000, type="int",
                        help="The M, N, and K for big gemm")


      if __name__ == "__main__":
          # Script entry point: parse the command line, optionally print the
          # reference timing table, run the benchmark and report the result.
          options, arguments = parser.parse_args(sys.argv)

          # Kept from the original script: only triggers if the parsed options
          # object carries a "help" attribute.
          if hasattr(options, "help"):
              print(options.help)
              sys.exit(0)

          if not options.quiet:
              # Reference timings users can compare their own result against.
              print("""
      Some results that you can compare against. They were 10 executions
      of gemm in float64 with matrices of shape 2000x2000 (M=N=K=2000).
      All memory layout was in C order.

      CPU tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB),
      Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
      Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB),
      Xeon X5560(2.8Ghz, 12M L2 cache, hyper-threads?)
      Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled),
      Core i7 950(3.07GHz, hyper-threads enabled)
      Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)


      Libraries tested:
      * numpy with ATLAS from distribution (FC9) package (1 thread)
      * manually compiled numpy and ATLAS with 2 threads
      * goto 1.26 with 1, 2, 4 and 8 threads
      * goto2 1.13 compiled with multiple threads enabled

      Xeon Xeon Xeon Core2 i7 i7 Xeon Xeon
      lib/nb threads E5345 E5430 E5450 E8500 930 950 X5560 X5550

      numpy 1.3.0 blas 775.92s
      numpy_FC9_atlas/1 39.2s 35.0s 30.7s 29.6s 21.5s 19.60s
      goto/1 18.7s 16.1s 14.2s 13.7s 16.1s 14.67s
      numpy_MAN_atlas/2 12.0s 11.6s 10.2s 9.2s 9.0s
      goto/2 9.5s 8.1s 7.1s 7.3s 8.1s 7.4s
      goto/4 4.9s 4.4s 3.7s - 4.1s 3.8s
      goto/8 2.7s 2.4s 2.0s - 4.1s 3.8s
      openblas/1 14.04s
      openblas/2 7.16s
      openblas/4 3.71s
      openblas/8 3.70s
      mkl 11.0.083/1 7.97s
      mkl 10.2.2.025/1 13.7s
      mkl 10.2.2.025/2 7.6s
      mkl 10.2.2.025/4 4.0s
      mkl 10.2.2.025/8 2.0s
      goto2 1.13/1 14.37s
      goto2 1.13/2 7.26s
      goto2 1.13/4 3.70s
      goto2 1.13/8 1.94s
      goto2 1.13/16 3.16s

      Test time in float32. There were 10 executions of gemm in
      float32 with matrices of shape 5000x5000 (M=N=K=5000)
      All memory layout was in C order.


      cuda version 8.0 7.5 7.0
      gpu
      M40 0.45s 0.47s
      k80 0.92s 0.96s
      K6000/NOECC 0.71s 0.69s
      P6000/NOECC 0.25s

      Titan X (Pascal) 0.28s
      GTX Titan X 0.45s 0.45s 0.47s
      GTX Titan Black 0.66s 0.64s 0.64s
      GTX 1080 0.35s
      GTX 980 Ti 0.41s
      GTX 970 0.66s
      GTX 680 1.57s
      GTX 750 Ti 2.01s 2.01s
      GTX 750 2.46s 2.37s
      GTX 660 2.32s 2.32s
      GTX 580 2.42s
      GTX 480 2.87s
      TX1 7.6s (float32 storage and computation)
      GT 610 33.5s
      """)

          # A size of 0 means the option was not given; use the common -B size.
          M = options.M or options.B
          N = options.N or options.B
          K = options.K or options.B

          # First argument: whether to actually run gemm; second: verbosity.
          t, impl = execute(not options.print_only, not options.quiet,
                            M=M, N=N, K=K, iters=options.iter,
                            order=options.order)

          if options.print_only:
              # Configuration was already printed by execute(); no timing.
              pass
          elif options.quiet:
              # Quiet mode prints only the elapsed time.
              print(t)
          else:
              print()
              print("We executed", options.iter, end=' ')
              print("calls to gemm with a and b matrices of shapes", end=' ')
              print("(%d, %d) and (%d, %d)." % (M, N, N, K))

              print()
              print('Total execution time: %.2fs on %s.' % (t, impl))
              print()
              print('Try to run this script a few times. Experience shows that'
                    ' the first time is not as fast as followings calls. The'
                    ' difference is not big, but consistent.')


      I also have a file called .theanorc in the same directory, containing:



      [global]
      floatx = float32

      [blas]
      ldflags = -LC:\Anaconda3\Library\bin -lmkl_r


      What I've tried



      I've tried moving .theanorc to its own folder at C:\Users\[username]\.theanorc, but that didn't fix the issue (everything I've read so far has been pretty ambiguous as to whether .theanorc should go in its own folder).







      python-2.7 windows-10 theano conda blas






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Nov 26 '18 at 3:22









      SnouSnou

      13




      13
























          0






          active

          oldest

          votes











          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53474390%2fhow-to-bind-theano-to-blas%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Stack Overflow!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53474390%2fhow-to-bind-theano-to-blas%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Wiesbaden

          Marschland

          Dieringhausen