Stream S3 gzipped csv file in python without downloading
If a file is in S3 and gzipped, we can easily stream it and read it line by line in Python.
Boto has a problem: the read method re-downloads the key if you call it after the key has been completely read once (compare the read and next methods to see the difference).
So I have used a ReadOnce class to mitigate this. ReadOnce implementation
Also, when I was testing, the first column had a \ufeff (the Unicode byte-order mark) in its name. I mitigated this by using 'utf-8-sig' as the encoding. It might not be needed. \ufeff problem
from boto.s3.connection import S3Connection
import gzip
import csv
import io
class ReadOnce(object):
    """Wrap a boto key so its contents are streamed exactly once.

    boto's ``Key.read`` re-downloads the object if called again after the
    key has been fully consumed.  This wrapper remembers exhaustion and
    keeps returning an empty byte string afterwards instead.
    """

    def __init__(self, k):
        self.key = k
        self.has_read_once = False

    def read(self, size=0):
        """Read up to *size* bytes; return b'' forever once exhausted."""
        if not self.has_read_once:
            chunk = self.key.read(size)
            if chunk:
                return chunk
            # An empty read means the key is exhausted; latch that state.
            self.has_read_once = True
        return b''
class ReadFromS3:
    """Stream gzipped, tab-separated files out of an S3 bucket without
    downloading them to disk first."""

    def __init__(self, options):
        """Connect to S3 using ``options['s3']['user'/'key'/'bucket']``.

        The credentials and bucket name come from the nested options dict.
        """
        conn = S3Connection(options['s3']['user'], options['s3']['key'])
        # validate=False skips an extra round-trip that checks the bucket exists.
        self.bucket = conn.get_bucket(options['s3']['bucket'], validate=False)

    def stream_file(self, file):
        """Stream the object named *file*, printing each TSV row as a dict.

        Fix: the original left the GzipFile/TextIOWrapper open; ``with``
        blocks now guarantee they are closed even if iteration raises.
        """
        # ReadOnce stops boto from re-downloading the key after exhaustion.
        key = ReadOnce(self.bucket.get_key(file))
        # gzip decompresses on the fly as the csv reader pulls bytes.
        with gzip.GzipFile(fileobj=key, mode='r') as gz_file:
            # utf-8-sig strips a possible U+FEFF BOM from the first column name.
            with io.TextIOWrapper(gz_file, newline="", encoding="utf-8-sig") as text:
                for line in csv.DictReader(text, delimiter='\t'):
                    print(line)
def main(options):
    """Stream the S3 object configured in *options*, printing each row.

    Fix: the original had an invalid ``<filename>`` placeholder; the object
    key now comes from ``options['s3']['file']`` so the script parses and
    is runnable once the options dict is filled in.
    """
    tsr = ReadFromS3(options)
    tsr.stream_file(options['s3']['file'])
if __name__ == "__main__":
    # Fix: the original used invalid-syntax placeholders (<s3user>, ...).
    # Replace the string values below with real credentials before running.
    options = {
        's3': {
            'user': 'YOUR_S3_ACCESS_KEY_ID',
            'key': 'YOUR_S3_SECRET_ACCESS_KEY',
            'bucket': 'YOUR_BUCKET_NAME',
            # Object key of the gzipped TSV file to stream.
            'file': 'path/to/your/file.tsv.gz',
        }
    }
    main(options)