{"id":407,"date":"2024-12-21T13:55:21","date_gmt":"2024-12-21T05:55:21","guid":{"rendered":"https:\/\/www.laixuexila.com\/?p=407"},"modified":"2024-12-21T13:55:21","modified_gmt":"2024-12-21T05:55:21","slug":"jsoup-%e7%a4%ba%e4%be%8b%e6%8c%87%e5%8d%97","status":"publish","type":"post","link":"https:\/\/www.laixuexila.com\/index.php\/2024\/12\/21\/jsoup-%e7%a4%ba%e4%be%8b%e6%8c%87%e5%8d%97\/","title":{"rendered":"Jsoup \u793a\u4f8b\u6307\u5357"},"content":{"rendered":"\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p class=\"wp-block-paragraph\"><code>Jsoup<\/code>\u00a0\u662f\u4e00\u4e2a\u975e\u5e38\u6d41\u884c\u7684 Java \u5e93\uff0c\u7528\u4e8e\u89e3\u6790 HTML \u6587\u6863\u3001\u5904\u7406\u7f51\u9875\u5185\u5bb9\u548c\u8fdb\u884c\u7f51\u9875\u6293\u53d6\u3002\u5b83\u63d0\u4f9b\u4e86\u5f3a\u5927\u7684 HTML \u89e3\u6790\u80fd\u529b\uff0c\u5e76\u652f\u6301\u7c7b\u4f3c jQuery \u7684 DOM \u64cd\u4f5c\u63a5\u53e3\uff0c\u9002\u7528\u4e8e\u7f51\u9875\u5185\u5bb9\u63d0\u53d6\u3001\u4fee\u6539\u4ee5\u53ca HTML \u5143\u7d20\u64cd\u4f5c\u7b49\u4efb\u52a1\u3002\u4ee5\u4e0b\u662f\u8be6\u7ec6\u7684 <strong>Jsoup \u793a\u4f8b\u6307\u5357<\/strong>\uff0c\u6db5\u76d6\u4e86\u5e38\u89c1\u7684\u4f7f\u7528\u573a\u666f\u548c\u529f\u80fd\u3002\u5e2e\u52a9\u4f60\u66f4\u597d\u5730\u7406\u89e3\u5982\u4f55\u4f7f\u7528\u8be5\u5e93\u3002<\/p>\n<\/blockquote>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">1. 
<strong>\u57fa\u672c\u7684 HTML \u89e3\u6790<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u89e3\u6790 HTML \u5b57\u7b26\u4e32<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\n\npublic class JsoupParseExample {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;head&gt;&lt;title&gt;Sample Page&lt;\/title&gt;&lt;\/head&gt;\"\n                    + \"&lt;body&gt;&lt;p&gt;Hello, World!&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        \/\/ \u89e3\u6790 HTML \u5b57\u7b26\u4e32\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u83b7\u53d6\u6807\u9898\n        System.out.println(\"Title: \" + doc.title()); \/\/ \u8f93\u51fa: Sample Page\n\n        \/\/ \u83b7\u53d6 &lt;p&gt; \u6807\u7b7e\u4e2d\u7684\u6587\u672c\n        System.out.println(\"Paragraph: \" + doc.select(\"p\").text()); \/\/ \u8f93\u51fa: Hello, World!\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">2. <strong>\u4ece URL \u89e3\u6790 HTML<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u4ece\u7f51\u7ad9\u52a0\u8f7d HTML<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\n\nimport java.io.IOException;\n\npublic class JsoupFromURLExample {\n    public static void main(String&#91;] args) {\n        try {\n            \/\/ \u8fde\u63a5\u5230 URL \u5e76\u6293\u53d6 HTML \u5185\u5bb9\n            Document doc = Jsoup.connect(\"https:\/\/example.com\").get();\n\n            \/\/ \u6253\u5370\u9875\u9762\u6807\u9898\n            System.out.println(\"Title: \" + doc.title());\n        } catch (IOException e) {\n            e.printStackTrace();\n        }\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">3. 
<strong>\u9009\u62e9\u5143\u7d20\u548c\u83b7\u53d6\u6587\u672c<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u9009\u62e9\u7279\u5b9a\u5143\u7d20\u5e76\u83b7\u53d6\u5176\u6587\u672c<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\n\npublic class JsoupSelectElements {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;body&gt;&lt;p class='intro'&gt;Hello&lt;\/p&gt;&lt;p&gt;World&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        \/\/ \u89e3\u6790 HTML\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u9009\u62e9\u5e76\u6253\u5370\u5177\u6709 class 'intro' \u7684 &lt;p&gt; \u6807\u7b7e\u6587\u672c\n        System.out.println(\"Intro Paragraph: \" + doc.select(\".intro\").text()); \/\/ \u8f93\u51fa: Hello\n\n        \/\/ \u9009\u62e9\u6240\u6709 &lt;p&gt; \u6807\u7b7e\u5e76\u6253\u5370\n        doc.select(\"p\").forEach(p -&gt; {\n            System.out.println(\"Paragraph: \" + p.text());\n        });\n        \/\/ \u8f93\u51fa:\n        \/\/ Paragraph: Hello\n        \/\/ Paragraph: World\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">4. 
<strong>\u83b7\u53d6\u548c\u4fee\u6539\u5c5e\u6027\u503c<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u83b7\u53d6\u548c\u4fee\u6539 HTML \u5143\u7d20\u7684\u5c5e\u6027<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes.Element;\n\npublic class JsoupAttributesExample {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;body&gt;&lt;a href='https:\/\/example.com' class='link'&gt;Click Here&lt;\/a&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u83b7\u53d6 &lt;a&gt; \u6807\u7b7e\u7684 href \u5c5e\u6027\n        Element link = doc.select(\"a\").first();\n        System.out.println(\"Link Href: \" + link.attr(\"href\")); \/\/ \u8f93\u51fa: https:\/\/example.com\n\n        \/\/ \u4fee\u6539 &lt;a&gt; \u6807\u7b7e\u7684 href \u5c5e\u6027\n        link.attr(\"href\", \"https:\/\/new-example.com\");\n        System.out.println(\"Updated Link Href: \" + link.attr(\"href\")); \/\/ \u8f93\u51fa: https:\/\/new-example.com\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">5. 
<strong>\u4fee\u6539 HTML \u5143\u7d20\u5185\u5bb9<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u4fee\u6539 HTML \u6807\u7b7e\u7684\u5185\u5bb9<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes.Element;\n\npublic class JsoupModifyContent {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;body&gt;&lt;p id='message'&gt;Original message&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u83b7\u53d6\u5e76\u4fee\u6539 &lt;p&gt; \u6807\u7b7e\u7684\u6587\u672c\n        Element p = doc.getElementById(\"message\");\n        p.text(\"Updated message\");\n\n        \/\/ \u8f93\u51fa\u4fee\u6539\u540e\u7684 HTML\n        System.out.println(doc.html()); \/\/ \u8f93\u51fa\uff08\u5b9e\u9645\u4e3a\u5e26\u7f29\u8fdb\u7684\u591a\u884c\u683c\u5f0f\uff09: &lt;html&gt;&lt;head&gt;&lt;\/head&gt;&lt;body&gt;&lt;p id=\"message\"&gt;Updated message&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">6. 
<strong>\u5220\u9664\u5143\u7d20<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u5220\u9664\u9875\u9762\u4e2d\u7684\u5143\u7d20<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\n\npublic class JsoupRemoveElement {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;body&gt;&lt;p&gt;This is a paragraph.&lt;\/p&gt;&lt;p class='remove'&gt;Remove this one&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u5220\u9664\u5177\u6709 'remove' \u7c7b\u540d\u7684 &lt;p&gt; \u6807\u7b7e\n        doc.select(\".remove\").remove();\n\n        \/\/ \u8f93\u51fa\u4fee\u6539\u540e\u7684 HTML\n        System.out.println(doc.html()); \/\/ \u8f93\u51fa\uff08\u5b9e\u9645\u4e3a\u5e26\u7f29\u8fdb\u7684\u591a\u884c\u683c\u5f0f\uff09: &lt;html&gt;&lt;head&gt;&lt;\/head&gt;&lt;body&gt;&lt;p&gt;This is a paragraph.&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">7. 
<strong>\u904d\u5386 HTML \u5143\u7d20<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u904d\u5386\u5e76\u64cd\u4f5c\u591a\u4e2a\u5143\u7d20<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes.Element;\n\npublic class JsoupIterateExample {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;body&gt;&lt;p&gt;First paragraph&lt;\/p&gt;&lt;p&gt;Second paragraph&lt;\/p&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u904d\u5386\u6240\u6709 &lt;p&gt; \u6807\u7b7e\u5e76\u8f93\u51fa\u6587\u672c\u5185\u5bb9\n        for (Element p : doc.select(\"p\")) {\n            System.out.println(\"Paragraph: \" + p.text());\n        }\n        \/\/ \u8f93\u51fa:\n        \/\/ Paragraph: First paragraph\n        \/\/ Paragraph: Second paragraph\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">8. 
<strong>\u6293\u53d6\u5e26\u8868\u683c\u7684 HTML<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u6293\u53d6\u5e76\u89e3\u6790 HTML \u8868\u683c\u6570\u636e<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes.Element;\nimport org.jsoup.select.Elements;\n\npublic class JsoupTableParsing {\n    public static void main(String&#91;] args) {\n        String html = \"&lt;html&gt;&lt;body&gt;&lt;table&gt;\"\n                    + \"&lt;tr&gt;&lt;th&gt;Name&lt;\/th&gt;&lt;th&gt;Age&lt;\/th&gt;&lt;\/tr&gt;\"\n                    + \"&lt;tr&gt;&lt;td&gt;John&lt;\/td&gt;&lt;td&gt;25&lt;\/td&gt;&lt;\/tr&gt;\"\n                    + \"&lt;tr&gt;&lt;td&gt;Jane&lt;\/td&gt;&lt;td&gt;30&lt;\/td&gt;&lt;\/tr&gt;\"\n                    + \"&lt;\/table&gt;&lt;\/body&gt;&lt;\/html&gt;\";\n\n        Document doc = Jsoup.parse(html);\n\n        \/\/ \u83b7\u53d6\u6240\u6709\u8868\u683c\u884c\n        Elements rows = doc.select(\"table tr\");\n\n        \/\/ \u904d\u5386\u6bcf\u4e00\u884c\uff0c\u8f93\u51fa\u8868\u683c\u6570\u636e\n        for (Element row : rows) {\n            Elements cols = row.select(\"td\");\n            if (!cols.isEmpty()) {\n                System.out.println(cols.get(0).text() + \" is \" + cols.get(1).text() + \" years old.\");\n            }\n        }\n        \/\/ \u8f93\u51fa:\n        \/\/ John is 25 years old.\n        \/\/ Jane is 30 years old.\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">9. 
<strong>\u5904\u7406 HTTP Cookies \u548c User-Agent<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u8bbe\u7f6e Cookie \u548c User-Agent<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\n\nimport java.io.IOException;\n\npublic class JsoupWithCookies {\n    public static void main(String&#91;] args) throws IOException {\n        Document doc = Jsoup.connect(\"https:\/\/example.com\")\n                            .cookie(\"user\", \"JohnDoe\")  \/\/ \u8bbe\u7f6e cookie\n                            .userAgent(\"Mozilla\/5.0 (Windows NT 10.0; Win64; x64)\")  \/\/ \u8bbe\u7f6e User-Agent\n                            .get();\n\n        \/\/ \u6253\u5370\u9875\u9762\u6807\u9898\n        System.out.println(\"Page Title: \" + doc.title());\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 class=\"wp-block-heading\">10. <strong>\u8bf7\u6c42\u8d85\u65f6\u8bbe\u7f6e<\/strong><\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u793a\u4f8b\uff1a\u8bbe\u7f6e\u8fde\u63a5\u8d85\u65f6<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code>import org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\n\nimport java.io.IOException;\n\npublic class JsoupTimeoutExample {\n    public static void main(String&#91;] args) {\n        try {\n            \/\/ \u8bbe\u7f6e\u8fde\u63a5\u8d85\u65f6\u65f6\u95f4\uff08\u5355\u4f4d\uff1a\u6beb\u79d2\uff09\n            Document doc = Jsoup.connect(\"https:\/\/example.com\")\n                                .timeout(5000)  \/\/ \u8bbe\u7f6e5\u79d2\u8d85\u65f6\n                                .get();\n\n            \/\/ \u8f93\u51fa\u7f51\u9875\u6807\u9898\n            System.out.println(\"Page Title: \" + doc.title());\n        } catch (IOException e) {\n            e.printStackTrace();\n        }\n    }\n}<\/code><\/pre>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h3 
class=\"wp-block-heading\">\u603b\u7ed3<\/h3>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u89e3\u6790 HTML<\/strong>: \u4f7f\u7528 <code>Jsoup.parse()<\/code> \u6765\u89e3\u6790 HTML \u5b57\u7b26\u4e32\uff0c\u6216\u4f7f\u7528 <code>Jsoup.connect()<\/code> \u4ece URL \u83b7\u53d6 HTML\u3002<\/li>\n\n\n\n<li><strong>\u9009\u62e9\u5143\u7d20<\/strong>: \u4f7f\u7528 <code>select()<\/code> \u65b9\u6cd5\u6839\u636e CSS \u9009\u62e9\u5668\u9009\u62e9\u5143\u7d20\uff0c\u7c7b\u4f3c\u4e8e jQuery \u7684\u8bed\u6cd5\u3002<\/li>\n\n\n\n<li><strong>\u4fee\u6539\u5185\u5bb9<\/strong>: \u4f7f\u7528 <code>text()<\/code>\u3001<code>html()<\/code>\u3001<code>attr()<\/code> \u7b49\u65b9\u6cd5\u6765\u4fee\u6539 HTML \u5143\u7d20\u7684\u5185\u5bb9\u6216\u5c5e\u6027\u3002<\/li>\n\n\n\n<li><strong>\u64cd\u4f5c\u8868\u683c\u548c\u5217\u8868<\/strong>: \u53ef\u4ee5\u8f7b\u677e\u63d0\u53d6\u8868\u683c\u6570\u636e\u548c\u5176\u4ed6\u6709\u5e8f\u5217\u8868\u6570\u636e\u3002<\/li>\n\n\n\n<li><strong>\u6293\u53d6\u7f51\u9875<\/strong>: <code>Jsoup<\/code> \u53ef\u4ee5\u4e0e\u7f51\u7edc\u8bf7\u6c42\u7ed3\u5408\uff0c\u6293\u53d6\u5e76\u89e3\u6790\u7f51\u9875\u5185\u5bb9\uff0c\u8fd8\u652f\u6301\u8bbe\u7f6e HTTP headers\u3001cookies \u548c timeout \u7b49\u3002<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">\u8fd9\u4e9b\u793a\u4f8b\u6db5\u76d6\u4e86 <code>Jsoup<\/code> \u4e2d\u7684\u5e38\u89c1\u64cd\u4f5c\uff0cJsoup \u662f\u4e00\u4e2a\u975e\u5e38\u5f3a\u5927\u7684\u5de5\u5177\uff0c\u5728\u5904\u7406 HTML \u548c\u7f51\u9875\u6293\u53d6\u65f6\u63d0\u4f9b\u4e86\u5f88\u591a\u4fbf\u5229\u7684\u529f\u80fd\u3002\u4f60\u53ef\u4ee5\u6839\u636e\u9700\u8981\u9009\u62e9\u5408\u9002\u7684\u65b9\u6cd5\u6765\u89e3\u6790\u3001\u4fee\u6539\u548c\u64cd\u4f5c HTML \u5185\u5bb9\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Jsoup\u00a0\u662f\u4e00\u4e2a\u975e\u5e38\u6d41\u884c\u7684 Java \u5e93\uff0c\u7528\u4e8e\u89e3\u6790 HTML 
\u6587\u6863\u3001\u5904\u7406\u7f51\u9875\u5185\u5bb9\u548c\u8fdb\u884c\u7f51\u9875\u6293\u53d6\u3002\u5b83\u63d0\u4f9b\u4e86\u5f3a\u5927 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[56],"tags":[],"class_list":["post-407","post","type-post","status-publish","format-standard","hentry","category-html"],"_links":{"self":[{"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/posts\/407","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/comments?post=407"}],"version-history":[{"count":1,"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/posts\/407\/revisions"}],"predecessor-version":[{"id":408,"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/posts\/407\/revisions\/408"}],"wp:attachment":[{"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/media?parent=407"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/categories?post=407"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.laixuexila.com\/index.php\/wp-json\/wp\/v2\/tags?post=407"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}