tensorflow => データの読み込み

CSVファイルの例を数える

import tensorflow as tf
# Queue of input filenames; num_epochs=1 closes the queue after one pass
# over the list, which is what eventually ends the reading loop.
filename_queue = tf.train.string_input_producer(["file.csv"], num_epochs=1)
# Reader that pulls filenames from the queue and yields (key, value) pairs.
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
# Decode each value into two columns, each with an integer default of 0.
col1, col2 = tf.decode_csv(value, record_defaults=[[0], [0]])

with tf.Session() as sess:
  # num_epochs is tracked in a local variable that must be initialized before
  # the first read. tf.local_variables_initializer() is the current name of
  # the deprecated tf.initialize_local_variables().
  sess.run(tf.local_variables_initializer())
  # The coordinator lets us stop and join the queue-runner threads on exit
  # instead of leaving them running in the background.
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  num_examples = 0
  try:
    while True:
      c1, c2 = sess.run([col1, col2])
      num_examples += 1
  except tf.errors.OutOfRangeError:
    # Raised once the filename queue is closed and every line has been read.
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print("There are %d examples" % num_examples)
  finally:
    coord.request_stop()
    coord.join(threads)

num_epochs=1は、リストの各ファイルを一度処理した後にstring_input_producerキューを閉じるようにします。その結果OutOfRangeErrorが発生し、これをtry:ブロックで捕捉します。デフォルトでは、string_input_producerはファイル名を無限に生成します。

tf.initialize_local_variables()はTensorFlowのOpで、実行されるとstring_input_producer内部のnum_epochローカル変数を初期化します。

tf.train.start_queue_runners()は、キューへのデータの追加を非同期で処理する追加のスレッドを開始します。

TFRecordファイルの読み込みと解析

TFRecordファイルは、データ（テンソル）を格納するためのネイティブテンソルフローバイナリ形式です。このファイルを読み込むには、CSVの例に似たコードを使用します。

import tensorflow as tf
# Queue of TFRecord filenames; num_epochs=1 limits reading to a single pass.
filename_queue = tf.train.string_input_producer(["file.tfrecord"], num_epochs=1)
# Reader that pulls filenames from the queue; each read yields a key and one
# still-serialized example.
reader = tf.TFRecordReader()
key, serialized_example = reader.read(filename_queue)

次に、serialized_exampleキューからサンプルを解析する必要があります。これには、事前のバッチ処理が必要だがより高速なtf.parse_exampleか、tf.parse_single_exampleのいずれかを使用できます：

# Group 100 serialized records into one batch, then parse the whole batch
# with a single op.
batch = tf.train.batch([serialized_example], batch_size=100)
# dtype must be passed by keyword: a positional argument after shape=...
# is a SyntaxError.
parsed_batch = tf.parse_example(batch, features={
  "feature_name_1": tf.FixedLenFeature(shape=[1], dtype=tf.int64),
  "feature_name_2": tf.FixedLenFeature(shape=[1], dtype=tf.float32)
})

tf.train.batchは、形状[x, y, z]の与えられたテンソルの連続する値を、形状[batch_size, x, y, z]のテンソルに結合します。features dictは、フィーチャの名前をTensorFlowのフィーチャ定義にマップします。parse_single_exampleも同様の方法で使用します：

# Parse one serialized record at a time (no prior batching required).
# dtype must be passed by keyword: a positional argument after shape=...
# is a SyntaxError.
parsed_example = tf.parse_single_example(serialized_example, {
  "feature_name_1": tf.FixedLenFeature(shape=[1], dtype=tf.int64),
  "feature_name_2": tf.FixedLenFeature(shape=[1], dtype=tf.float32)
})

tf.parse_exampleとtf.parse_single_exampleは、フィーチャ名を、その値を保持するテンソルにマップする辞書を返します。

parse_single_exampleから得られるサンプルをバッチ処理するには、dictからテンソルを抽出し、前と同様にtf.train.batchを使用する必要があります。

# Batch every parsed tensor together, then re-attach the feature names.
# Names and tensors are captured once, in matching order, as real lists so
# the code also works where dict.keys()/values() return views (Python 3).
feature_names = list(parsed_example.keys())
feature_tensors = [parsed_example[name] for name in feature_names]
parsed_batch = dict(zip(feature_names,
    tf.train.batch(feature_tensors, batch_size=100)))

前と同様にデータを読み込み、評価するすべてのテンソルのリストをsess.runに渡します：

with tf.Session() as sess:
  # tf.local_variables_initializer() is the current name of the deprecated
  # tf.initialize_local_variables(); it initializes the epoch counter.
  sess.run(tf.local_variables_initializer())
  # The coordinator lets us stop and join the queue-runner threads on exit.
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  try:
    while True:
      # list(...) gives sess.run a real list even where values() is a view.
      data_batch = sess.run(list(parsed_batch.values()))
      # process data
  except tf.errors.OutOfRangeError:
    # Expected: the input queue is exhausted after the last epoch.
    pass
  finally:
    coord.request_stop()
    coord.join(threads)

例をランダムにシャッフルする

サンプルをランダムにシャッフルするには、次のようにtf.train.batchの代わりにtf.train.shuffle_batch関数を使用できます：

# Used in place of tf.train.batch: builds batches of 100 serialized examples
# taken in random order from a queue that holds at most `capacity` elements
# and keeps at least `min_after_dequeue` elements after each dequeue.
parsed_batch = tf.train.shuffle_batch([serialized_example],
    batch_size=100, capacity=1000,
    min_after_dequeue=200)

tf.train.shuffle_batch（およびtf.train.batch）はtf.Queueを作成し、そこにserialized_examplesを追加し続けます。

capacityは、一度にキューに格納できる要素の数を表します。容量が大きいほどメモリ使用量は大きくなりますが、スレッドがキューを埋めるのを待つことによるレイテンシは小さくなります。

min_after_dequeueは、要素を取り出した後にキューに残っている要素の最小数です。shuffle_batchキューは、要素を完全に一様にシャッフルするわけではありません。メモリに収まらない巨大なデータを念頭に置いて設計されているためです。代わりに、min_after_dequeueからcapacityまでの間の数の要素を読み込んでメモリに格納し、その中からランダムにバッチを選択します。その後、要素数をmin_after_dequeueとcapacityの間に保つために、さらにいくつかの要素をキューに追加します。したがって、min_after_dequeueの値が大きいほど要素はよりランダムになります（batch_size個の要素は、少なくともmin_after_dequeue個の連続する要素の中から選ばれることが保証されます）。ただし、その分capacityを大きくする必要があり、最初にキューを埋めるのにかかる時間も長くなります。

バッチ処理でnエポックのデータを読み込む

あなたのデータの例がすでにPythonの変数に読み込まれていて、与えられたサイズのバッチでそれをn回読みたいとします：

import numpy as np
import tensorflow as tf
# In-memory examples that will be served in batches.
data = np.array([1, 2, 3, 4, 5])
# Number of times the whole array should be read.
n = 4

データをバッチにまとめるには（必要に応じてランダムにシャッフルしながら）、tf.train.batchまたはtf.train.shuffle_batchを使用できますが、データ全体をn回生成するテンソルを渡す必要があります：

# Wrap the array in a tensor that can be produced n times; the next attempt
# raises OutOfRangeError.
limited_tensor = tf.train.limit_epochs(data, n)
# enqueue_many=True treats each element of the array as a separate example.
# min_after_dequeue is a required argument of shuffle_batch and must be
# smaller than capacity; 1 still leaves room for a full batch of 3 in a
# queue of capacity 4.
batch = tf.train.shuffle_batch([limited_tensor], batch_size=3, enqueue_many=True,
                               capacity=4, min_after_dequeue=1)

limit_epochsは、内部でnumpy配列をテンソルに変換し、それをn回生成したあとにOutOfRangeErrorをスローするテンソルを返します。shuffle_batchに渡される引数enqueue_many=Trueは、テンソルリスト[limited_tensor]の各テンソルが複数のサンプルを含むものとして解釈されることを示します。バッチ処理キューの容量は、テンソル内のサンプル数より小さくてもよいことに注意してください。

いつものようにデータを処理することができます：

with tf.Session() as sess:
  # tf.local_variables_initializer() is the current name of the deprecated
  # tf.initialize_local_variables(); it initializes the epoch counter.
  sess.run(tf.local_variables_initializer())
  # The coordinator lets us stop and join the queue-runner threads on exit.
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  try:
    while True:
      data_batch = sess.run(batch)
      # process data
  except tf.errors.OutOfRangeError:
    # Expected: the data has been produced n times and the queue is closed.
    pass
  finally:
    coord.request_stop()
    coord.join(threads)

TXTファイルから画像とラベルを読み込む方法

Tensorflowのドキュメントでは、TXTファイルから画像とラベルを直接読み込む方法について説明されていません。以下のコードは、私がそれをどのように実現したかを示しています。ただし、これが最善の方法であることや、この方法が以降のステップで役立つことを意味するものではありません。

たとえば、ドキュメントではワンホットベクトル[0,1]を使用していますが、ここではラベルを単一の整数値{0,1}として読み込んでいます。

# Learning how to import images and labels from a TXT file
#
# TXT file format
#
# path/to/imagefile_1 label_1
# path/to/imagefile_2 label_2
# ...                 ...
#
# where label_X is either {0,1}

#Importing Libraries
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes

#File containing the path to images and the labels [path/to/images label]
filename = '/path/to/List.txt'

#Lists where to store the paths and labels
filenames = []
labels = []

#Reading file and extracting paths and labels
with open(filename, 'r') as list_file:
    for line in list_file: #Stream line-by-line instead of loading every line first
        words = line.split() #Splitting lines in words using whitespace as separator
        if not words: #Skip blank lines (e.g. a trailing newline at end of file)
            continue
        filenames.append(words[0])
        labels.append(int(words[1]))

NumFiles = len(filenames)

#Converting the Python lists of paths and labels into string / int32 tensors
tfilenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
tlabels = ops.convert_to_tensor(labels, dtype=dtypes.int32)

#Creating a queue of (path, label) slices, shuffled, for 10 epochs;
#element [0] of the result is the path and element [1] the label
filename_queue = tf.train.slice_input_producer([tfilenames, tlabels], num_epochs=10, shuffle=True, capacity=NumFiles)

#Reading the image file named by the current path and decoding it
rawIm= tf.read_file(filename_queue[0])
decodedIm = tf.image.decode_png(rawIm) # png or jpg decoder

#Extracting the label that belongs to the current path
label_queue = filename_queue[1]

#Single op that initializes both local and global variables so we avoid warnings and errors
init_op = tf.group(tf.local_variables_initializer() ,tf.global_variables_initializer())

#Creating an InteractiveSession so we can run in iPython
sess = tf.InteractiveSession()

with sess.as_default():
    sess.run(init_op)

    # Start populating the filename queue.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        for _ in range(NumFiles): #One image/label pair per entry in the list
            nm, image, lb = sess.run([filename_queue[0], decodedIm, label_queue])

            #Single-argument print(...) behaves the same on Python 2 and 3
            print(image.shape)
            print(nm)
            print(lb)

            #Showing the current image
            plt.imshow(image)
            plt.show()
    finally:
        #Always stop and join the queue-runner threads, even if a read fails
        coord.request_stop()
        coord.join(threads)

Modified text is an extract of the original Stack Overflow Documentation

ライセンスを受けた CC BY-SA 3.0

所属していない Stack Overflow

tensorflow
データの読み込み

サーチ…

CSVファイルの例を数える

TFRecordファイルの読み込みと解析

例をランダムにシャッフルする

バッチ処理でnエポックのデータを読み込む

TXTファイルから画像とラベルを読み込む方法