使用 Playwright 实现 URL 转为 PDF 文件

前言

需求是将一个网址对应的页面内容输出为一个PDF文件。期间先后尝试了 Flying Saucer、OpenHTMLToPDF、JSoup、PDFBox 等技术,思路大多是先通过URL获取HTML页面,再解析HTML内容,最后生成PDF文件。但由于目标URL比较特殊(政府网址),很难获取到完整的HTML页面,还存在标签没有闭合等问题,导致最终生成的PDF是空白文件。最后改用 Playwright:用无头浏览器加载并渲染页面,再将页面导出为PDF文件(同时保存一张整页截图用于调试),之后上传OSS。下面详细展开。

引入依赖

在 pom.xml 中加入下面的仓库和依赖配置:
<!--
  Extra Maven repository (id "github-playwright") pointing at the GitHub Packages
  feed of microsoft/playwright-java. Release artifacts are enabled and snapshot
  artifacts are disabled.
  NOTE(review): the playwright artifact appears to be published on Maven Central
  as well, and GitHub Packages usually requires authentication; confirm whether
  this repository entry is actually needed.
-->
<repositories>
<repository>
<id>github-playwright</id>
<url>https://maven.pkg.github.com/microsoft/playwright-java</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>

<!-- Playwright for Java, version 1.49.0. This element belongs inside the POM's <dependencies> section. -->
<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>playwright</artifactId>
<version>1.49.0</version>
</dependency>

实现工具类

下面的工具类使用 Playwright 启动无头 Chromium,加载目标页面并自动滚动以触发懒加载,然后保存一张整页截图用于调试,最后将页面导出为PDF文件:

/**
 * Exports an online web page to a PDF file by driving a headless Chromium
 * instance through Playwright.
 */
object HtmlToPdfExporter {

    /** Fixed pause in milliseconds after auto-scrolling, before the page is captured. */
    private const val POST_SCROLL_WAIT_MS = 6000.0

    /**
     * In-page script that scrolls down 500px every 200ms until the scrolled
     * distance reaches `document.body.scrollHeight`, then resolves its promise.
     * Used to trigger lazily loaded content.
     */
    private const val AUTO_SCROLL_SCRIPT =
        "() => new Promise(resolve => {" +
            "let total = 0, step = 500;" +
            "const timer = setInterval(() => {" +
            " window.scrollBy(0, step);" +
            " total += step;" +
            " if (total >= document.body.scrollHeight) {" +
            " clearInterval(timer);" +
            " resolve();" +
            " }" +
            "}, 200);" +
            "})"

    /**
     * Print-media CSS passed to `page.addStyleTag` before exporting.
     *
     * NOTE(review): `addStyleTag` expects raw CSS, but this text is wrapped in
     * literal `<style>` tags, which most likely makes the rules unparseable and
     * therefore inert. Removing the tags would activate the aggressive
     * `* { display: block !important; ... }` rule and could change the rendered
     * PDF substantially, so the text is deliberately kept unchanged here.
     * Decide explicitly whether these rules are wanted before touching it.
     * The content is flush-left on purpose so the injected text stays identical.
     */
    private const val PRINT_STYLE = """
<style>
@media print {
body { margin: 0 !important; padding: 0 !important; min-height: auto !important; }
* { float: none !important; position: static !important; visibility: visible !important; display: block !important; }
table { page-break-inside: auto !important; }
tr { page-break-inside: avoid !important; page-break-after: auto !important; }
thead { display: table-header-group; }
tfoot { display: table-footer-group; }
div, p { break-inside: avoid; }
}
</style>
"""

    /**
     * Loads [inputUrl] in headless Chromium, scrolls through the page, saves a
     * full-page debug screenshot next to the PDF, and writes the page as a PDF.
     *
     * @param inputUrl URL of the page to export; only `http` and `https` are accepted.
     * @param outputPdf path of the PDF file to write. Missing parent directories are created.
     * @param pageFormat PDF paper format passed to Playwright, e.g. `"A4"`.
     * @param landscape whether to use landscape orientation (the article's example uses `false`).
     * @param waitTime base timeout in milliseconds (the article suggests 6000). Navigation
     *   uses twice this value; waiting for the page selector uses this value as-is.
     * @return `true` when the PDF was written; `false` when any exception occurred
     *   (the error is printed to standard error together with its stack trace).
     */
    fun htmlToPdfAutoPaging(
        inputUrl: String,
        outputPdf: String,
        pageFormat: String,
        landscape: Boolean,
        waitTime: Int,
    ): Boolean {
        return try {
            val scheme: String? = URI(inputUrl).scheme
            require(scheme.equals("http", ignoreCase = true) || scheme.equals("https", ignoreCase = true)) {
                "只支持 http/https 在线地址"
            }

            // Make sure the target directory exists. Resolving to an absolute file first
            // avoids a null parent when outputPdf is a bare file name such as "out.pdf".
            File(outputPdf).absoluteFile.parentFile?.mkdirs()

            Playwright.create().use { playwright ->
                // `use` closes the browser on failure as well as on success.
                playwright.chromium()
                    .launch(BrowserType.LaunchOptions().setHeadless(true))
                    .use { browser ->
                        val page = browser.newPage()

                        println("[INFO] 开始加载: $inputUrl")
                        page.navigate(
                            inputUrl,
                            Page.NavigateOptions()
                                .setWaitUntil(WaitUntilState.NETWORKIDLE)
                                .setTimeout(waitTime * 2.0),
                        )

                        // Wait until the content container or the body is present.
                        page.waitForSelector(
                            "#content, body",
                            Page.WaitForSelectorOptions().setTimeout(waitTime.toDouble()),
                        )

                        // Scroll through the page to trigger lazy loading, then pause.
                        page.evaluate(AUTO_SCROLL_SCRIPT)
                        page.waitForTimeout(POST_SCROLL_WAIT_MS)

                        // Full-page screenshot, kept as a debugging aid.
                        val screenshotPath = screenshotPathFor(outputPdf)
                        page.screenshot(
                            Page.ScreenshotOptions()
                                .setPath(Paths.get(screenshotPath))
                                .setFullPage(true),
                        )
                        println("[DEBUG] 截图已保存: $screenshotPath")

                        // Report iframes. This only logs; the export below always prints
                        // the top-level page and never switches to a frame.
                        val frames = page.frames()
                        if (frames.size > 1) {
                            println("[INFO] 检测到 iframe 数量: ${frames.size}")
                            frames
                                .map { it.url() }
                                .firstOrNull { it != "about:blank" && it != inputUrl }
                                ?.let { println("[INFO] 尝试使用 iframe 内容: $it") }
                        }

                        // Inject the print stylesheet (see the review note on PRINT_STYLE).
                        page.addStyleTag(Page.AddStyleTagOptions().setContent(PRINT_STYLE))

                        // Export the PDF.
                        page.pdf(
                            Page.PdfOptions()
                                .setPath(Paths.get(outputPdf))
                                .setFormat(pageFormat)
                                .setLandscape(landscape)
                                .setMargin(
                                    Margin()
                                        .setTop("0.1in")
                                        .setBottom("0.5in")
                                        .setLeft("0.5in")
                                        .setRight("0.5in"),
                                )
                                .setPrintBackground(true)
                                .setPreferCSSPageSize(true)
                                .setScale(1.0),
                        )
                        println("[SUCCESS] PDF 已生成: $outputPdf")
                        true
                    }
            }
        } catch (e: Exception) {
            System.err.println("[ERROR] 生成失败: $inputUrl\n${e.message}")
            e.printStackTrace()
            false
        }
    }

    /**
     * Derives the debug screenshot path from [outputPdf]: a trailing `.pdf`
     * (any letter case) is swapped for `.png`; otherwise `.png` is appended.
     * Only the suffix is touched, so a `.pdf` inside a directory name is left
     * alone and the screenshot never shares a path with the PDF.
     */
    private fun screenshotPathFor(outputPdf: String): String {
        val base = if (outputPdf.endsWith(".pdf", ignoreCase = true)) outputPdf.dropLast(4) else outputPdf
        return "$base.png"
    }
}