Tensorflow reads multi csv needs too much time












0















I plan to read the feature data from multi csv files.

Each csv file has 150 columns, and the batch size is 256.



The time cost for just read 1000 iteration needs roughly 12s.

I feel the time cost for that should not be that much, any one here can have some suggestion?



def _parse_csv_row(*vals):
features = tf.convert_to_tensor(vals[0:f_size * 5])
class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
return features, class_label


def get_batch_data(name):
root_path="g:\market\2018-11-12\feature_{}\".format(name)
file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
dataset = tf.contrib.data.CsvDataset(
file_queue,
record_defaults,
buffer_size=1024 * 1024 * 10,
header=True,
na_value='0.0',
select_cols=selected_cols)
dataset = dataset.apply(tf.contrib.data.map_and_batch(
map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
dataset = dataset.prefetch(256 * 1024)
dataset = dataset.repeat()

dataset = dataset.shuffle(buffer_size=32)
iterator = dataset.make_one_shot_iterator()
feature_batch, label_batch = iterator.get_next()
return feature_batch, label_batch

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
a, b = get_batch_data("train")
start_time = time.time()
for x in range(1000):
v = sess.run([a,b])
print(time.time() - start_time)









share|improve this question



























    0















    I plan to read the feature data from multi csv files.

    Each csv file has 150 columns, and the batch size is 256.



    The time cost for just read 1000 iteration needs roughly 12s.

    I feel the time cost for that should not be that much, any one here can have some suggestion?



    def _parse_csv_row(*vals):
    features = tf.convert_to_tensor(vals[0:f_size * 5])
    class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
    return features, class_label


    def get_batch_data(name):
    root_path="g:\market\2018-11-12\feature_{}\".format(name)
    file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
    record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
    selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
    dataset = tf.contrib.data.CsvDataset(
    file_queue,
    record_defaults,
    buffer_size=1024 * 1024 * 10,
    header=True,
    na_value='0.0',
    select_cols=selected_cols)
    dataset = dataset.apply(tf.contrib.data.map_and_batch(
    map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
    dataset = dataset.prefetch(256 * 1024)
    dataset = dataset.repeat()

    dataset = dataset.shuffle(buffer_size=32)
    iterator = dataset.make_one_shot_iterator()
    feature_batch, label_batch = iterator.get_next()
    return feature_batch, label_batch

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    a, b = get_batch_data("train")
    start_time = time.time()
    for x in range(1000):
    v = sess.run([a,b])
    print(time.time() - start_time)









    share|improve this question

























      0












      0








      0








      I plan to read the feature data from multi csv files.

      Each csv file has 150 columns, and the batch size is 256.



      The time cost for just read 1000 iteration needs roughly 12s.

      I feel the time cost for that should not be that much, any one here can have some suggestion?



      def _parse_csv_row(*vals):
      features = tf.convert_to_tensor(vals[0:f_size * 5])
      class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
      return features, class_label


      def get_batch_data(name):
      root_path="g:\market\2018-11-12\feature_{}\".format(name)
      file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
      record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
      selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
      dataset = tf.contrib.data.CsvDataset(
      file_queue,
      record_defaults,
      buffer_size=1024 * 1024 * 10,
      header=True,
      na_value='0.0',
      select_cols=selected_cols)
      dataset = dataset.apply(tf.contrib.data.map_and_batch(
      map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
      dataset = dataset.prefetch(256 * 1024)
      dataset = dataset.repeat()

      dataset = dataset.shuffle(buffer_size=32)
      iterator = dataset.make_one_shot_iterator()
      feature_batch, label_batch = iterator.get_next()
      return feature_batch, label_batch

      config = tf.ConfigProto()
      config.gpu_options.allow_growth = True
      with tf.Session(config=config) as sess:
      sess.run(tf.global_variables_initializer())
      a, b = get_batch_data("train")
      start_time = time.time()
      for x in range(1000):
      v = sess.run([a,b])
      print(time.time() - start_time)









      share|improve this question














      I plan to read the feature data from multi csv files.

      Each csv file has 150 columns, and the batch size is 256.



      The time cost for just read 1000 iteration needs roughly 12s.

      I feel the time cost for that should not be that much, any one here can have some suggestion?



      def _parse_csv_row(*vals):
      features = tf.convert_to_tensor(vals[0:f_size * 5])
      class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
      return features, class_label


      def get_batch_data(name):
      root_path="g:\market\2018-11-12\feature_{}\".format(name)
      file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
      record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
      selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
      dataset = tf.contrib.data.CsvDataset(
      file_queue,
      record_defaults,
      buffer_size=1024 * 1024 * 10,
      header=True,
      na_value='0.0',
      select_cols=selected_cols)
      dataset = dataset.apply(tf.contrib.data.map_and_batch(
      map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
      dataset = dataset.prefetch(256 * 1024)
      dataset = dataset.repeat()

      dataset = dataset.shuffle(buffer_size=32)
      iterator = dataset.make_one_shot_iterator()
      feature_batch, label_batch = iterator.get_next()
      return feature_batch, label_batch

      config = tf.ConfigProto()
      config.gpu_options.allow_growth = True
      with tf.Session(config=config) as sess:
      sess.run(tf.global_variables_initializer())
      a, b = get_batch_data("train")
      start_time = time.time()
      for x in range(1000):
      v = sess.run([a,b])
      print(time.time() - start_time)






      tensorflow tensorflow-datasets






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Nov 26 '18 at 12:34









      CarpemerCarpemer

      415




      415
























          1 Answer
          1






          active

          oldest

          votes


















          1














          The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.



          I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.



          Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.






          share|improve this answer
























            Your Answer






            StackExchange.ifUsing("editor", function () {
            StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
            });
            });
            }, "code-snippets");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "1"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });














            draft saved

            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53481231%2ftensorflow-reads-multi-csv-needs-too-much-time%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes









            1














            The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.



            I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.



            Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.






            share|improve this answer




























              1














              The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.



              I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.



              Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.






              share|improve this answer


























                1












                1








                1







                The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.



                I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.



                Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.






                share|improve this answer













                The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.



                I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.



                Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.







                share|improve this answer












                share|improve this answer



                share|improve this answer










                answered Nov 26 '18 at 14:12









                Vlad-HCVlad-HC

                1,0341015




                1,0341015
































                    draft saved

                    draft discarded




















































                    Thanks for contributing an answer to Stack Overflow!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53481231%2ftensorflow-reads-multi-csv-needs-too-much-time%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    Tonle Sap (See)

                    I get strange results when I access the Sqlitedatabase with Unity C# via XAMPP

                    Guatemaltekische Davis-Cup-Mannschaft