[toc]

python 文件的IO一

练习

有一个文件，对其进行单词统计，不区分大小写，并显示单词重复最多的10个单词

# 第一次处理
# os.path /usr/local/lib '/usr/local/lib'

def wordcount(file='sample.txt'):
    """Count case-insensitive word frequencies in a UTF-8 text file.

    Every line is split on whitespace, the punctuation listed in ``chars``
    is stripped from both ends of each token, and the token is lower-cased.
    The ten most frequent words (or fewer, if the file does not contain
    that many) are printed one per line as ``word, count``.

    Args:
        file: Path of the text file to analyse.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
    """
    chars = '''~!@#$%^&*()_+{}[]|\\/"'=;:.-<>'''
    word_count = {}
    with open(file, encoding='utf-8') as f:
        for line in f:
            for token in line.split():
                word = token.strip(chars).lower()
                # A token made only of punctuation strips down to '' and
                # must not be counted as a word.
                if not word:
                    continue
                word_count[word] = word_count.get(word, 0) + 1

    lst = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    # Slicing instead of indexing 0..9 stays safe when the file holds
    # fewer than ten distinct words.
    for word, count in lst[:10]:
        print('{}, {}'.format(word, count))
    return lst

wordcount()
# 输出
the, 5
a, 5
can, 3
download, 3
and, 3
for, 3
version, 2
these, 2
in, 2
this, 2

第二次改进

def wordcount(file='sample.txt'):
    """Count case-insensitive word frequencies, splitting on punctuation.

    Unlike a plain whitespace split, every character listed in ``chars``
    acts as a word separator wherever it occurs, so ``foo-bar`` is counted
    as the two words ``foo`` and ``bar``. The ten most frequent words (or
    fewer, if the file does not contain that many) are printed one per
    line as ``word, count``.

    Args:
        file: Path of the UTF-8 text file to analyse.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
    """
    chars = '''~!@#$%^&*()_+{}[]|\\/"'=;:.-<>'''
    # Turn every separator character into a space so that a single split()
    # handles leading, trailing, embedded and repeated punctuation alike.
    table = str.maketrans(chars, ' ' * len(chars))
    word_count = {}
    with open(file, encoding='utf-8') as f:
        for line in f:
            for word in line.lower().translate(table).split():
                word_count[word] = word_count.get(word, 0) + 1

    lst = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    for word, count in lst[:10]:
        print('{}, {}'.format(word, count))
    return lst

wc = wordcount()
print(wc)
print(len(wc))
the, 5
a, 5
can, 3
download, 3
and, 3
for, 3
version, 2
these, 2
in, 2
this, 2
[('the', 5), ('a', 5), ('can', 3), ('download', 3), ('and', 3), ('for', 3), ('version', 2), ('these', 2), ('in', 2), ('this', 2), ('to', 2), ('as', 2), ('new', 2), ('of', 2), ('changes,', 2), ('with', 2), ('linux', 2), ('common', 2), ('control', 1), ('systems', 1), ('keep', 1), ('revisions', 1), ('straight,', 1), ('storing', 1), ('modifications', 1), ('central', 1), ('repository', 1), ('allows', 1), ('developers', 1), ('easily', 1), ('collaborate,', 1), ('they', 1), ('software,', 1), ('make', 1), ('upload', 1), ('newest', 1), ('revision', 1), ('every', 1), ('developer', 1), ('see', 1), ('them,', 1), ('contribute', 1), ('similarly,', 1), ('people', 1), ('who', 1), ('have', 1), ('nothing', 1), ('do', 1), ('development', 1), ('project', 1), ('still', 1), ('files', 1), ('use', 1), ('them', 1), ('most', 1), ('users', 1), ('should', 1), ('be', 1), ('familiar', 1), ('process,', 1), ('using', 1), ('git,', 1), ('subversion,', 1), ('or', 1), ('some', 1), ('other', 1), ('similar', 1), ('method', 1), ('is', 1), ('pretty', 1), ('downloading', 1), ('needed', 1), ('files—especially', 1), ('preparation', 1), ('compiling', 1), ('program', 1), ('from', 1), ('source', 1), ('code', 1), ('rather', 1), ('practice', 1), ('geeks', 1)]
82

StringIO

io 模块中的类;
from io import StringIO;
内存中，开辟的一个文本模式的buffer, 可以像文件对象一样操作它：
当close方法被调用的时候，这个buffer会被释放。

StringIO操作

getvalue() 获取全部内容。跟文件指针没有关系;

用法

from io import StringIO

# Create an empty in-memory text buffer.
sio = StringIO()
# The recorded output below shows all three capabilities reporting True.
print(sio.readable(),sio.writable(),sio.seekable())
sio.write("asjin.com\npython")
# Writing moved the position past the text; rewind so reading starts at the top.
sio.seek(0)
print(sio.readlines())
# getvalue() returns the whole buffer regardless of the current position,
# so it still works although readlines() has consumed everything.
print(sio.getvalue())
print()
# read() does depend on the position, hence the second rewind.
sio.seek(0)
print(sio.read())
# Closing releases the buffer.
sio.close()
# 输出
True True True
['asjin.com\n', 'python']
asjin.com
python
asjin.com
python

好处
一般来说，磁盘的操作比内存的操作要慢得多，内存足够的情况下，一般的优化思路是少落地，减少磁盘IO的过程，可以大大提高程序的运行效率。

BytesIO

io模块中的类
from io import BytesIO
内存中，开辟一个二进制模式的buffer，可以像文件对象一样操作它;
当close方法被调用的时候，这个buffer会被释放。

from io import BytesIO # build the buffer in memory
bio = BytesIO()
print(bio.readable(),bio.writable(),bio.seekable())
# Binary buffer: the payload written here is bytes, not str.
bio.write(b"ssjinyao.com\nPython")
# Rewind so readlines() starts from the first byte.
bio.seek(0)
print(bio.readlines())
print(bio.getvalue()) # ignores the position and returns the full contents
bio.close()
# 输出
True True True
[b'ssjinyao.com\n', b'Python']
b'ssjinyao.com\nPython'

file-like 对象

类文件对象，可以像文件对象一样操作;
sock对象、输入输出对象(stdin、stdout)都是类文件对象;

from sys import stdout
# sys.stdout is a file-like object, so it can be bound to a name and used
# like any opened file.
f = stdout
print(type(f))
# write() emits the text as-is; unlike print() it appends no newline.
f.write('asjin.com')

os.path 模块

python3.4 之前

from os import path

p = path.join('/etc','hosts')
# join() returns a plain str, not a dedicated path object.
print(type(p),p)
print(path.exists(p))
# split() separates the final component from everything before it.
print(path.split(p))
print(path.abspath('.'))
# p is already absolute, so in the recorded output the leading 'o://'
# component is discarded and the result is '/etc/hosts/test.txt'.
# NOTE(review): that output was captured on a POSIX system; Windows path
# rules differ -- confirm before relying on it there.
p = path.join('o://',p,'test.txt')
print(path.dirname(p))
print(path.basename(p))
# The recorded run shows an empty drive part for this path.
print(path.splitdrive(p))
# 输出
<class 'str'> /etc/hosts
True
('/etc', 'hosts')
/Users/ssjinyao/Desktop/MyPython/testpy/kernel
/etc/hosts
test.txt
('', '/etc/hosts/test.txt')

3.4版本开始

建议使用pathlib模块，提供Path对象来操作。包括目录和文件；

pathlib模块

In [1]: from pathlib import Path
In [2]: p = Path()
In [3]: type(p)
Out[3]: pathlib.PosixPath

实现

In [1]: from pathlib import Path
In [2]: p = Path()
In [3]: type(p)
Out[3]: pathlib.PosixPath
In [4]: p.absolute()
Out[4]: PosixPath('/root/mypython/python3')
In [5]: p.joinpath('as','jin')
Out[5]: PosixPath('as/jin')
In [6]: p.absolute()
Out[6]: PosixPath('/root/mypython/python3')
In [7]: p = p.joinpath('as','jin')
In [8]: p.absolute()
Out[8]: PosixPath('/root/mypython/python3/as/jin')
In [9]: p = p / 'a' / 'b'
In [10]: p.absolute()
Out[10]: PosixPath('/root/mypython/python3/as/jin/a/b')

拼接

In [13]: p2 = Path('')
In [14]: p2 = p2 / '/etc/' / 'sysconfig'
In [15]: p2
Out[15]: PosixPath('/etc/sysconfig')

路径拼接和分解
操作符/
Path对象/ Path对象
Path对象/ 字符串或者字符串 / Path 对象
分解
parts 属性，可以返回路径中的每一个部分
joinpath
joinpath(*other)连接多个字符串到Path对象中

In [16]: p = Path()
In [17]: p = p / 'a'
In [18]: p1 = 'b' / p
In [19]: p2 = Path('c')
In [20]: p3 = p2 / p1
In [21]: print(p3.parts)
('c', 'b', 'a')
In [22]: p3.joinpath('etc','init.d',Path('httpd'))
Out[22]: PosixPath('c/b/a/etc/init.d/httpd')

获取路径
str获取路径字符串
bytes获取路径字符串的bytes

In [24]: p = Path('/etc')
In [25]: print(str(p),bytes(p))
/etc b'/etc'

parent 目录的逻辑父目录
parents 父目录序列，索引0是直接的父目录

p =Path('/a/b/c/d')
In [29]: p = Path('/a/b/c/d')

In [30]: print(p.parent.parent)
/a/b
In [31]: for x in p.parents:
    ...:     print(x)
    ...:
/a/b/c
/a/b
/a
/

name、stem、 suffix、 suffixes、 with_suffix(suffix)、with_name(name)
name 目录的最后一个部分
suffix 目录中最后一个部分的扩展名
stem 目录最后一个部分，没有后缀
suffixes 返回多个扩展名列表
with_suffix(suffix) 替换路径最后一部分的扩展名并返回新的路径；如果原来没有扩展名，则直接追加;
with_name(name) 替换目录最后一个部分并返回一个新的路径。

In [33]: p4 = Path('/etc/sysconfig/network/xx.ifg.gz')
In [34]: p4.name
Out[34]: 'xx.ifg.gz'
In [35]: p4.stem
Out[35]: 'xx.ifg'
In [36]: p4.suffix
Out[36]: '.gz'
In [39]: p4.suffixes
Out[39]: ['.ifg', '.gz']

In [40]: p4.with_suffix('.tgz')
Out[40]: PosixPath('/etc/sysconfig/network/xx.ifg.tgz')
In [41]: p4.with_name('test')
Out[41]: PosixPath('/etc/sysconfig/network/test')

cwd() 返回当前工作目录;
home() 返回当前家目录;

is_dir() 是否是目录;
is_file() 是否是普通文件;
is_symlink() 是否是软链接;
is_socket() 是否是socket文件;
is_block_device() 是否是块设备;
is_char_device() 是否是字符设备;
is_absolute() 是否是绝对路径;

resolve() 返回一个新的路径，这个新路径就是当前Path对象的绝对路径，如果是软链接则直接被解析;
absolute() 也可以获取绝对路径，但是推荐使用resolve();
exists() 目录或文件是否存在;
rmdir() 删除空目录。没有提供判断目录为空的方法;
touch(mode=0o666,exist_ok=True)创建一个文件;
as_uri() 将路径返回成URI，例如'file:///etc/passwd'
mkdir(mode=0o777,parents=False,exist_ok=False)
parents, 是否创建父目录，True等同于mkdir -p; False时父目录不存在，则抛出FileNotFoundError;
exist_ok参数，在3.5版本加入。False时，路径存在，抛出FileExistsError; True 时FileExistsError被忽略；

In [3]: p =Path('/a/b/c/d')
In [4]: p.home()
Out[4]: PosixPath('/root')
In [5]: p.is_dir()
Out[5]: False
In [6]: p.is_absolute()
Out[6]: True
In [7]: p.absolute().parent
Out[7]: PosixPath('/a/b/c')

iterdir()
迭代当前目录

In [12]: for x in Path().iterdir():
    ...:     if x.is_dir():
    ...:         print(x)
.ipynb_checkpoints

通配符

glob(pattern)通配给定的模式
rglob(pattern)通配给定的模式，递归目录

list(p.glob('test*')) # 返回当前目录对象下的test开头的文件
list(p.glob('**/*.py')) # 递归所有目录，等同rglob
list(p.rglob('*.py'))

In [1]: from pathlib import Path
In [2]: list(Path().glob('*.py'))
Out[2]: [PosixPath('random_max_min2.py')]

匹配
match(pattern)
模式匹配，成功返回True

In [4]: Path('.py').match('*.py')
Out[4]: True
In [5]: Path('/a/b/c.py').match('b/*.py')
Out[5]: True
In [6]: Path('/a/b/c.py').match('a/*.py')
Out[6]: False
In [7]: Path('/a/b/c.py').match('a/*/*.py')
Out[7]: True
In [8]: Path('/a/b/c.py').match('a/**/*.py')
Out[8]: True
In [9]: Path('/a/b/c.py').match('**/*.py')
Out[9]: True

文件操作

open(mode='r',buffering=-1,encoding=None, errors=None,newline=None)
使用的方法类似内建函数open。返回一个文件对象

3.5 增加的新函数
read_bytes()
以'rb'方式读取路径对应文件，并返回二进制流;

Path.write_bytes(data)
以'wb'方式写入数据到路径对应文件;

write_text(data,encoding=None,errors=None)
以'wt'方式写入字符串到路径对应文件;

In [10]: p4 = Path('/root/testfile')
In [11]: p4.open('w+')
Out[11]: <_io.TextIOWrapper name='/root/testfile' mode='w+' encoding='UTF-8'>
In [12]: p4.touch()
In [13]: with p4.open('w+') as f:
    ...:     f.write('test')
    ...:
In [14]: f.closed
# 可以直接写入
In [15]: p4.write_text('test_write_text')
In [17]: p4.read_text()
Out[17]: 'test_write_text'

from pathlib import Path
# NOTE(review): writing under /etc normally needs elevated privileges --
# confirm the target path before running this.
p = Path('/etc/config.py')
# write_text() writes the string to the file in one call.
p.write_text('workprocess=16')
print(p.read_text())
# open() is called without a mode here, so it uses the read-mode default;
# read(5) returns at most the first 5 characters.
with p.open() as f:
    print(f.read(5))

os模块

os.name windows是nt，linux 是posix;
os.uname() Linux支持显示;
sys.platform windows 显示win32，Linux 显示linux
os.listdir('/etc')
返回目录内容列表。
os 也有open、 read、 write 等方法，但是太低级，建议使用内建函数open以及它返回的文件对象的read、write方法;
os.stat(path, *, dir_fd=None, follow_symlinks=True)
调用Linux系统的stat;
path: 路径的string 或者bytes, 或者fd
follow_symlinks True 返回文件本身信息，False如果是软链接则显示软链接本身;

os.chmod(path,mode,*,dir_fd=None, follow_symlinks=True)
os.chmod('test',0o777)
os.chown(path,uid,gid)
改变文件的属主、属组，但需要足够的权限

shutil 模块

到目前为止
文件拷贝：使用打开2个文件对象，源文件读取内容，写入目标文件中来完成拷贝过程。但是这样丢失stat数据信息(权限等)，因为根本就没有复制过去。
目录怎么办呢？
Python提供了一个方便的库shutil(高级文件操作)。

copy复制

copyfileobj(fsrc,fdst[,length])

文件对象的复制，fsrc和fdst 是open打开的文件对象，复制内容。fdst要求可写。
length指定了表示buffer的大小;

from pathlib import Path
import shutil


p1 = Path('./test')
# Make sure the source file exists, because mode 'r+' does not create it.
p1.touch()
with open('./test','r+') as f1:
    f1.write('abcd\n asjin')
    f1.flush()
    # NOTE: after the write, f1 is positioned just past the text that was
    # written, and copyfileobj() reads from the current position onward, so
    # none of that text is copied (the listing recorded further down shows
    # test1 as 0B). Call f1.seek(0) before copying to transfer the content.
    with open('./test1','w+') as f2:
        shutil.copyfileobj(f1,f2)
(py3_env) testpy ➤ cat kernel/test
abcd
asjin%
(py3_env) testpy ➤ cat kernel/test1

copyfile(src,dst,*,follow_symlinks=True)
复制文件内容，不含元数据。src、dst为文件的路径字符串;
本质上调用的就是copyfileobj，所以不带元数据二进制内容复制;

import shutil
shutil.copyfile('./test','./test2')

copymode(src,dst,*,follow_symlinks=True)

仅仅复制权限

(py3_env) testpy ➤chmod 400 kernel/test2
import shutil
shutil.copymode('./test','./test2')
#执行后再查看copy的权限
-rw-r--r--  1 ssjinyao  staff     0B Oct 11 21:58 kernel/test1
-rw-r--r--  1 ssjinyao  staff    11B Oct 11 22:10 kernel/test2

copystat(src,dst,*,follow_symlinks=True)

复制元数据，stat包含权限

copy(src,dst,*,follow_symlinks=True)

复制文件内容、权限和部分元数据，不包括创建时间和修改时间。
本质上调用的是

copyfile(src,dst,follow_symlinks=follow_symlinks)
copymode(src,dst,follow_symlinks=follow_symlinks)

copy2 比copy多了复制全部元数据，但需要平台支持;
本质上调用的是
copyfile(src,dst,follow_symlinks=follow_symlinks)
copystat(src,dst,follow_symlinks=follow_symlinks)

copytree(src,dst,symlinks=False,ignore=None,copy_function=copy2,ignore_dangling_symlinks=False)

递归复制目录。默认使用copy2，也就是带更多的元数据复制；

src、dst 必须是目录，src必须是存在 dst必须不存在;
ignore=func, 提供一个callable(src,names) -> ignored_names。提供一个函数，它会被调用。src是源目录，names是os.listdir(src)的结果，就是列出src中的文件名，返回值是要被过滤的文件名的set类型数据。

copytree 是最常用的

def ignore(src, names):
    """Pick the entries that copytree should skip: names starting with 'a'.

    Args:
        src: Directory currently being copied (not used by this filter).
        names: Entry names found inside ``src``.

    Returns:
        A set holding every name in ``names`` that begins with 'a'.
    """
    return {name for name in names if name.startswith('a')}
shutil.copytree('./testdir','/test/dir',ignore=ignore)

rm删除

shutil.rmtree(path,ignore_errors=False,onerror=None)
递归删除。如同rm -rf 一样危险，慎用。
它不是原子操作，有可能删除错误，就会中断，已经删除的就删除了;
ignore_errors为True，忽略错误。当为False或者omitted时onerror生效;
onerror为callable，接受函数function、path和excinfo。

shutil.rmtree('./test') #类似rm -rf

move 移动

move(src,dst,copy_function=copy2)

递归移动文件、目录到目标，返回目录;
本身移动文件、目录到目标，返回目标;
如果不支持rename，如果是目录则先copytree再删除源目录;
默认使用copy2方法。

os.rename('./test.txt','/temp/t')
os.rename('test3','/temp/test/test300')

shutil还有打包功能。生成tar并压缩。支持zip、gz、bz、xz。

格言

共情的实质–把你的生活扩展到别人的生活里，把你的耳朵放到别人的灵魂中，用心去聆听那里最急切的喃喃私语

python 文件的IO二