Python 高效操作文件 与 HTML 网页基础入门
(2025–2026 实用写法,适合有基础但想快速上手文件/网页处理的同学)
一、文件操作(推荐现代写法)
1. 文本文件最推荐的写法(with + utf-8)
# 读取整个文件(小文件推荐)
with open('data.txt', 'r', encoding='utf-8') as f:
content = f.read() # 一次性读完
# 或(以下三种读法任选其一:同一个文件对象读到末尾后,再次读取只会得到空结果)
lines = f.readlines() # 按行读 → 列表
# 或
for line in f: # 最省内存(一行一行读)
print(line.strip())
# 写入(覆盖模式)
with open('output.txt', 'w', encoding='utf-8') as f:
f.write("第一行\n第二行\n")
f.writelines(["第三行\n", "第四行\n"])
# 追加模式(日志最常用)
with open('log.txt', 'a', encoding='utf-8') as f:
f.write("2026-03-03 新记录\n")
2. 按行高效处理大文件(推荐写法)
# 方法1:最省内存(一行一行处理,不加载全部)
with open('very_big.log', 'r', encoding='utf-8') as f:
for line in f:
if "ERROR" in line:
print(line.strip())
# 方法2:使用 pathlib(read_text 需 Python 3.5+,更面向对象;注意它会一次性读入全部内容,只适合小文件)
from pathlib import Path
file = Path("config.ini")
text = file.read_text(encoding="utf-8") # 读全部
lines = file.read_text(encoding="utf-8").splitlines()
# 写入
Path("newfile.txt").write_text("内容", encoding="utf-8")
3. CSV 文件(最常用场景)
import csv
# 读取
with open('users.csv', 'r', encoding='utf-8', newline='') as f:
reader = csv.DictReader(f) # 推荐!自动把首行当字段名
for row in reader:
print(row['username'], row['age'])
# 写入(自动处理引号、逗号转义)
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['id', 'name', 'score'])
writer.writeheader()
writer.writerow({'id': 1, 'name': '小明', 'score': 98.5})
writer.writerows([
{'id': 2, 'name': '小红', 'score': 92},
{'id': 3, 'name': '小刚', 'score': 85}
])
4. JSON 文件(API、配置文件最常用)
import json
# 读
with open('config.json', 'r', encoding='utf-8') as f:
data = json.load(f) # 直接得到 dict / list
# 写(自动格式化)
with open('new_config.json', 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
5. 文件/目录常用操作(pathlib 现代写法)
from pathlib import Path
import shutil
p = Path("folder/subfolder/file.txt")
print(p.exists()) # 是否存在
print(p.is_file()) # 是文件吗
print(p.is_dir()) # 是目录吗
print(p.parent) # 父目录
print(p.name) # 文件名
print(p.stem) # 不带扩展名的名字
print(p.suffix) # .txt
# 创建目录(递归创建)
Path("logs/2026/03").mkdir(parents=True, exist_ok=True)
# 复制文件 / 目录
shutil.copy("source.txt", "backup.txt")
shutil.copytree("src_folder", "dst_folder")
# 删除(小心!)
# p.unlink() # 删除文件
# p.rmdir() # 删除空目录
# shutil.rmtree("folder") # 强制递归删除目录(危险)
二、HTML 网页基础操作(最常用四种方式)
1. requests + BeautifulSoup(爬虫/解析最经典组合)
import requests
from bs4 import BeautifulSoup
url = "https://www.example.com/news"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120"
}
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
# 常用提取方式
title = soup.title.string
h1 = soup.find('h1').get_text(strip=True)
all_links = soup.find_all('a')
# CSS 选择器(更强大)
news_items = soup.select('div.news-item > h2 > a')
for item in news_items:
print(item.get_text(), item['href'])
else:
print("请求失败", response.status_code)
2. 保存网页为本地文件(最简单静态保存)
# 方法1:直接保存源码
with open('page.html', 'w', encoding='utf-8') as f:
f.write(response.text)
# 方法2:用 requests-html(支持 JS 渲染,需 pip install requests-html)
from requests_html import HTMLSession
session = HTMLSession()
r = session.get("https://dynamic-page.com")
r.html.render(timeout=20) # 等待 JS 执行
print(r.html.html) # 渲染后的完整 HTML
3. 快速生成简单 HTML(模板 / 报表常用)
# 方法1:f-string 字符串拼接(小页面够用;today 需事先定义,例如 from datetime import date; today = date.today())
html = f"""
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<title>报告</title>
</head>
<body>
<h1>数据统计 - {today}</h1>
<table border="1">
<tr><th>姓名</th><th>分数</th></tr>
<tr><td>小明</td><td>98</td></tr>
</table>
</body>
</html>
"""
with open("report.html", "w", encoding="utf-8") as f:
f.write(html)
4. Jinja2 模板(推荐:报表、邮件、爬虫伪造页面)
# pip install jinja2
from jinja2 import Template
template_str = """
<h1>欢迎 {{ name }}</h1>
<ul>
{% for item in items %}
<li>{{ item }}</li>
{% endfor %}
</ul>
"""
t = Template(template_str)
html = t.render(name="张三", items=["苹果", "香蕉", "橙子"])
with open("welcome.html", "w", encoding="utf-8") as f:
f.write(html)
快速记忆口诀(面试/日常常用)
文件操作:
- 小文件 → with open(..., 'r', encoding='utf-8') as f: f.read()
- 大文件 → for line in f:
- 结构化数据 → csv.DictReader / json.load
- 路径操作 → 用 pathlib.Path
网页操作:
- 静态页面 → requests.get() + BeautifulSoup
- 动态页面 → requests-html 或 selenium(重)
- 生成 HTML → 字符串 / Jinja2
需要我继续深入哪个方向?
- Excel 操作(pandas / openpyxl)
- PDF 读写(pypdf(原 PyPDF2)/ pdfplumber)
- 图片处理(PIL / pillow)
- requests 高级用法(代理、session、重试、异步)
- BeautifulSoup 更复杂的选择器写法
直接告诉我你最想学的部分~