2007-07-18
Python Cookbook 2.14 将输入文件重定位到文件头
需求:
你构造了一个文件对象(从网络或者文件句柄中读数据),需要定位到文件头,以便读取它的信息.
讨论:
将文件操作封装到一个类里面:
from cStringIO import StringIO
class RewindableFile(object):
    """Wrap a file handle to allow seeks back to the beginning.

    Everything read through the wrapper is also stored in an in-memory
    buffer, so that rewind() (or seek() to the starting offset) can replay
    it.  Once nobuffer() has been called and the buffered data has been
    replayed, the wrapper rebinds seek/tell/read/readline/readlines to the
    wrapped file's own methods and drops the buffer.
    """

    def __init__(self, input_file):
        """Wrap input_file into a file-like object with rewind.

        The starting offset is taken from input_file.tell(); when the file
        has no tell() or it raises IOError, the offset falls back to 0.
        """
        self.file = input_file
        self.buffer_file = StringIO()
        self.at_start = True
        try:
            self.start = input_file.tell()
        except (IOError, AttributeError):
            self.start = 0
        self._use_buffer = True

    def seek(self, offset, whence=0):
        """Seek to a given byte position.

        Must be: whence == 0 and offset == self.start

        Raises:
            ValueError: for any other whence or offset.
        """
        if whence != 0:
            raise ValueError("whence=%r; expecting 0" % (whence,))
        if offset != self.start:
            raise ValueError("offset=%r; expecting %s" % (offset, self.start))
        self.rewind()

    def rewind(self):
        """Simplified way to seek back to the beginning."""
        self.buffer_file.seek(0)
        self.at_start = True

    def tell(self):
        """Return the current position of the file (must be at start).

        Raises:
            TypeError: if data has been read since wrapping or the last rewind.
        """
        if not self.at_start:
            raise TypeError("RewindableFile can't tell except at start of file")
        return self.start

    def _read(self, size):
        """Return up to size items, replaying buffered data first.

        A negative size reads everything that is left; a size of 0 returns
        the empty string without touching either file.
        """
        if size < 0:  # read all the way to the end of the file
            # Drain the replay buffer *before* appending the new data.  This
            # leaves the buffer positioned at its end, so the write below
            # appends instead of overwriting data not yet replayed.
            x = self.buffer_file.read()
            y = self.file.read()
            if self._use_buffer:
                self.buffer_file.write(y)
            return x + y
        elif size == 0:  # no need to actually read the empty string
            return ""
        x = self.buffer_file.read(size)
        if len(x) < size:
            # The buffer is exhausted (and positioned at its end), so the
            # remainder comes from the wrapped file and is appended.
            y = self.file.read(size - len(x))
            if self._use_buffer:
                self.buffer_file.write(y)
            return x + y
        return x

    def read(self, size=-1):
        """Read up to 'size' bytes from the file.

        Default is -1, which means to read to end of file.
        """
        x = self._read(size)
        if self.at_start and x:
            self.at_start = False
        self._check_no_buffer()
        return x

    def readline(self):
        """Read a line from the file."""
        # Can we get it out of the buffer_file?
        s = self.buffer_file.readline()
        if s[-1:] == "\n":
            # Data was consumed, so tell() must stop reporting the start.
            self.at_start = False
            return s
        # No, so read a line from the input file.  The buffer is at its end
        # here, so the write appends.
        t = self.file.readline()
        if self._use_buffer:
            self.buffer_file.write(t)
        if s or t:
            self.at_start = False
        self._check_no_buffer()
        return s + t

    def readlines(self):
        """Read all remaining lines from the file."""
        return self.read().splitlines(True)

    def _check_no_buffer(self):
        """Switch to the wrapped file's methods once the buffer is done with.

        Only acts after nobuffer() has been called and the buffer's read
        position has reached the end of its contents.
        """
        # If 'nobuffer' has been called and we're finished with the buffer file,
        # get rid of the buffer, redirect everything to the original input file.
        if not self._use_buffer and \
                self.buffer_file.tell() == len(self.buffer_file.getvalue()):
            # for top performance, we rebind all relevant methods in self;
            # a method the wrapped file lacks is rebound to None.
            for n in 'seek tell read readline readlines'.split():
                setattr(self, n, getattr(self.file, n, None))
            del self.buffer_file

    def nobuffer(self):
        """Tell RewindableFile to stop using the buffer once it's exhausted."""
        self._use_buffer = False
有时,从网络或文件句柄中读取的数据并不是我们所期望的.比如,你从一个有问题的服务器读数据,本来应该返回XML流,可是它返回了一些没有格式的错误消息.(这种情况很常见,因为很多服务器并没有很好的处理异常输入情况).
本节的RewindableFile类可以帮助你解决这个问题,r = RewindableFile(f) 将原始的输入流 f 封装到一个"可重定位的文件"对象 r 中,并提供了缓冲区功能.对 r 的读操作被转发给 f ,数据先保存到了缓冲区,然后返回给调用者,而缓冲区里面保存了所有的数据.
r 可以调用rewind方法,即能重定位到文件头,下一次的读操作将首先从缓冲区中读数据,读完后,再从输入流读,而新读入的数据也会添加到缓冲区中.
当不再需要缓冲的时候,调用 r 的nobuffer方法.它告诉 r :一旦缓冲区中已有的内容被读完,就可以丢弃缓冲区,之后的读操作直接转发给原始输入流.调用nobuffer之后,seek就没有意义了.
举例说明,你要访问的服务器会返回错误信息:ERROR:can't do that,或者XML信息,<?xml...:
# Example: the server answers either with a line starting with "ERROR:"
# or with an XML document.  Peek at the first line, then rewind so that
# the XML processing sees the stream from its beginning.
import urllib2

import RewindableFile

infile = urllib2.urlopen("http://somewhere/")
infile = RewindableFile.RewindableFile(infile)
s = infile.readline()
if s.startswith("ERROR:"):
    # Drop the trailing newline from the server's message.
    raise Exception(s[:-1])
infile.seek(0)
infile.nobuffer()  # Don't buffer the data any more
# ... process the XML from infile ...
本节的类不支持一个有时很有用的Python技巧:你无法可靠地保存RewindableFile实例的绑定方法(如果你不知道什么是绑定方法,那没有关系,因为那样的话你肯定也不会想去保存它们).原因是,在调用nobuffer并且缓冲区中的内容被读完之后,RewindableFile会把被封装文件的read,readline等方法重新赋值为self对象的实例变量.这样做性能会好一些,代价是不支持保存绑定方法这种不常用的技巧.
tell方法用于获得当前文件指针的位置,对于RewindableFile实例,它只能在刚完成封装之后(或重定位到文件头之后),进行任何读操作之前调用,返回的是起始位置.RewindableFile在封装时会尝试获得被封装文件的真实位置,并将它做为起始位置.如果被封装的文件对象不支持tell,那么RewindableFile的tell实现将返回0.
你构造了一个文件对象(从网络或者文件句柄中读数据),需要定位到文件头,以便读取它的信息.
讨论:
将文件操作封装到一个类里面:
from cStringIO import StringIO
class RewindableFile(object):
    """Wrap a file handle to allow seeks back to the beginning.

    Everything read through the wrapper is also stored in an in-memory
    buffer, so that rewind() (or seek() to the starting offset) can replay
    it.  Once nobuffer() has been called and the buffered data has been
    replayed, the wrapper rebinds seek/tell/read/readline/readlines to the
    wrapped file's own methods and drops the buffer.
    """

    def __init__(self, input_file):
        """Wrap input_file into a file-like object with rewind.

        The starting offset is taken from input_file.tell(); when the file
        has no tell() or it raises IOError, the offset falls back to 0.
        """
        self.file = input_file
        self.buffer_file = StringIO()
        self.at_start = True
        try:
            self.start = input_file.tell()
        except (IOError, AttributeError):
            self.start = 0
        self._use_buffer = True

    def seek(self, offset, whence=0):
        """Seek to a given byte position.

        Must be: whence == 0 and offset == self.start

        Raises:
            ValueError: for any other whence or offset.
        """
        if whence != 0:
            raise ValueError("whence=%r; expecting 0" % (whence,))
        if offset != self.start:
            raise ValueError("offset=%r; expecting %s" % (offset, self.start))
        self.rewind()

    def rewind(self):
        """Simplified way to seek back to the beginning."""
        self.buffer_file.seek(0)
        self.at_start = True

    def tell(self):
        """Return the current position of the file (must be at start).

        Raises:
            TypeError: if data has been read since wrapping or the last rewind.
        """
        if not self.at_start:
            raise TypeError("RewindableFile can't tell except at start of file")
        return self.start

    def _read(self, size):
        """Return up to size items, replaying buffered data first.

        A negative size reads everything that is left; a size of 0 returns
        the empty string without touching either file.
        """
        if size < 0:  # read all the way to the end of the file
            # Drain the replay buffer *before* appending the new data.  This
            # leaves the buffer positioned at its end, so the write below
            # appends instead of overwriting data not yet replayed.
            x = self.buffer_file.read()
            y = self.file.read()
            if self._use_buffer:
                self.buffer_file.write(y)
            return x + y
        elif size == 0:  # no need to actually read the empty string
            return ""
        x = self.buffer_file.read(size)
        if len(x) < size:
            # The buffer is exhausted (and positioned at its end), so the
            # remainder comes from the wrapped file and is appended.
            y = self.file.read(size - len(x))
            if self._use_buffer:
                self.buffer_file.write(y)
            return x + y
        return x

    def read(self, size=-1):
        """Read up to 'size' bytes from the file.

        Default is -1, which means to read to end of file.
        """
        x = self._read(size)
        if self.at_start and x:
            self.at_start = False
        self._check_no_buffer()
        return x

    def readline(self):
        """Read a line from the file."""
        # Can we get it out of the buffer_file?
        s = self.buffer_file.readline()
        if s[-1:] == "\n":
            # Data was consumed, so tell() must stop reporting the start.
            self.at_start = False
            return s
        # No, so read a line from the input file.  The buffer is at its end
        # here, so the write appends.
        t = self.file.readline()
        if self._use_buffer:
            self.buffer_file.write(t)
        if s or t:
            self.at_start = False
        self._check_no_buffer()
        return s + t

    def readlines(self):
        """Read all remaining lines from the file."""
        return self.read().splitlines(True)

    def _check_no_buffer(self):
        """Switch to the wrapped file's methods once the buffer is done with.

        Only acts after nobuffer() has been called and the buffer's read
        position has reached the end of its contents.
        """
        # If 'nobuffer' has been called and we're finished with the buffer file,
        # get rid of the buffer, redirect everything to the original input file.
        if not self._use_buffer and \
                self.buffer_file.tell() == len(self.buffer_file.getvalue()):
            # for top performance, we rebind all relevant methods in self;
            # a method the wrapped file lacks is rebound to None.
            for n in 'seek tell read readline readlines'.split():
                setattr(self, n, getattr(self.file, n, None))
            del self.buffer_file

    def nobuffer(self):
        """Tell RewindableFile to stop using the buffer once it's exhausted."""
        self._use_buffer = False
有时,从网络或文件句柄中读取的数据并不是我们所期望的.比如,你从一个有问题的服务器读数据,本来应该返回XML流,可是它返回了一些没有格式的错误消息.(这种情况很常见,因为很多服务器并没有很好的处理异常输入情况).
本节的RewindableFile类可以帮助你解决这个问题,r = RewindableFile(f) 将原始的输入流 f 封装到一个"可重定位的文件"对象 r 中,并提供了缓冲区功能.对 r 的读操作被转发给 f ,数据先保存到了缓冲区,然后返回给调用者,而缓冲区里面保存了所有的数据.
r 可以调用rewind方法,即能重定位到文件头,下一次的读操作将首先从缓冲区中读数据,读完后,再从输入流读,而新读入的数据也会添加到缓冲区中.
当不再需要缓冲的时候,调用 r 的nobuffer方法.它告诉 r :一旦缓冲区中已有的内容被读完,就可以丢弃缓冲区,之后的读操作直接转发给原始输入流.调用nobuffer之后,seek就没有意义了.
举例说明,你要访问的服务器会返回错误信息:ERROR:can't do that,或者XML信息,<?xml...:
# Example: the server answers either with a line starting with "ERROR:"
# or with an XML document.  Peek at the first line, then rewind so that
# the XML processing sees the stream from its beginning.
import urllib2

import RewindableFile

infile = urllib2.urlopen("http://somewhere/")
infile = RewindableFile.RewindableFile(infile)
s = infile.readline()
if s.startswith("ERROR:"):
    # Drop the trailing newline from the server's message.
    raise Exception(s[:-1])
infile.seek(0)
infile.nobuffer()  # Don't buffer the data any more
# ... process the XML from infile ...
本节的类不支持一个有时很有用的Python技巧:你无法可靠地保存RewindableFile实例的绑定方法(如果你不知道什么是绑定方法,那没有关系,因为那样的话你肯定也不会想去保存它们).原因是,在调用nobuffer并且缓冲区中的内容被读完之后,RewindableFile会把被封装文件的read,readline等方法重新赋值为self对象的实例变量.这样做性能会好一些,代价是不支持保存绑定方法这种不常用的技巧.
tell方法用于获得当前文件指针的位置,对于RewindableFile实例,它只能在刚完成封装之后(或重定位到文件头之后),进行任何读操作之前调用,返回的是起始位置.RewindableFile在封装时会尝试获得被封装文件的真实位置,并将它做为起始位置.如果被封装的文件对象不支持tell,那么RewindableFile的tell实现将返回0.