{"id":49757,"date":"2023-08-10T23:50:24","date_gmt":"2023-08-10T15:50:24","guid":{"rendered":"http:\/\/xyryd.com\/?p=49757"},"modified":"2024-03-30T14:31:54","modified_gmt":"2024-03-30T06:31:54","slug":"%e4%bd%bf%e7%94%a8python3%e6%8b%86%e5%88%86%e5%a4%a7%e6%96%87%e4%bb%b6txt-%e5%9b%be%e6%96%87%e6%95%99%e7%a8%8b","status":"publish","type":"post","link":"http:\/\/www.xyryd.com\/49757.html","title":{"rendered":"\u4f7f\u7528python3\u62c6\u5206\u5927\u6587\u4ef6txt \u56fe\u6587\u6559\u7a0b"},"content":{"rendered":"\n

<p>\u6700\u8fd1\u7f51\u7ad9\u88ab\u653b\u51fb\uff0ccdn\u4e00\u4e0b\u88ab\u62532TB\uff0c\u7136\u540e\u4e0b\u8f7d\u4e86\u65e5\u5fd7\u8fdb\u884c\u5206\u6790\uff0c\u4f46\u662f\u65e5\u5fd7\u6587\u4ef6\u6709\u51e0\u5341\u5146\uff0c\u6240\u4ee5\u9700\u8981\u505a\u4e00\u4e0b\u5207\u5272\uff0c\u8fd9\u91cc\u6211\u4eec\u8bb0\u4e00\u4e0b<code>python3<\/code>\u7684\u62c6\u5206\u6587\u4ef6\u811a\u672c\uff0c\u4ee5\u5907\u540e\u7528\u3002<\/p>\n\n\n\n

<\/figure>\n\n\n\n

<p>Python\u4f5c\u4e3a\u5feb\u901f\u5f00\u53d1\u5de5\u5177\uff0c\u5176\u4ee3\u7801\u8868\u8fbe\u529b\u5f3a\uff0c\u5f00\u53d1\u6548\u7387\u9ad8\uff0c\u56e0\u6b64\u7528Python\u5feb\u901f\u5199\u4e00\u4e2a\uff0c\u8fd8\u662f\u53ef\u884c\u7684\u3002<\/p>\n\n\n\n

<h2>python3\u4ee3\u7801\u811a\u672c<\/h2>\n\n\n\n
import os\r\nimport sys\r\nimport random\r\nimport threading\r\nimport requests\r\nfrom urllib.parse import urlparse, urljoin\r\nfrom bs4 import BeautifulSoup\r\nimport re\r\nimport time\r\n\r\nlock = threading.Lock()\r\n\r\nclass TotalSizeCounter:\r\n    def __init__(self):\r\n        self.total_size = 0\r\n        self.lock = threading.Lock()\r\n\r\n    def add_size(self, size):\r\n        with self.lock:\r\n            self.total_size += size\r\n\r\n    def get_total_size(self):\r\n        with self.lock:\r\n            return self.total_size\r\n\r\ntotal_size_counter = TotalSizeCounter()\r\n\r\n# \u751f\u6210\u968f\u673a\u7684User-Agent\u5934\u90e8\u4fe1\u606f\r\ndef generate_user_agent():\r\n    user_agents = [\r\n        # iOS\r\n        \"Mozilla\/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit\/605.1.15 (KHTML, like Gecko) CriOS\/84.0.4147.122 Mobile\/15E148 Safari\/604.1\",\r\n        \"Mozilla\/5.0 (iPad; CPU OS 14_0 like Mac OS X) AppleWebKit\/605.1.15 (KHTML, like Gecko) CriOS\/84.0.4147.122 Mobile\/15E148 Safari\/604.1\",\r\n        \r\n        # Android\r\n        \"Mozilla\/5.0 (Linux; Android 11; Pixel 5) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/94.0.4606.54 Mobile Safari\/537.36\",\r\n        \"Mozilla\/5.0 (Linux; Android 11; SM-G998B) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/94.0.4606.54 Mobile Safari\/537.36\",\r\n        \r\n        # Windows\r\n        \"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/94.0.4606.54 Safari\/537.36\",\r\n        \"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Edge\/94.0.992.31\",\r\n        \"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Firefox\/100.0\",\r\n\r\n        # macOS\r\n        \"Mozilla\/5.0 (Macintosh; Intel Mac OS X 11_6_1) AppleWebKit\/605.1.15 (KHTML, like Gecko) Version\/14.1.2 Safari\/605.1.15\",\r\n        \"Mozilla\/5.0 (Macintosh; Intel Mac OS X 11_6_1) 
AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/94.0.4606.54 Safari\/537.36\",\r\n        \r\n        # Linux\r\n        \"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/94.0.4606.54 Safari\/537.36\",\r\n        \"Mozilla\/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko\/20100101 Firefox\/95.0\"\r\n    ]\r\n    return random.choice(user_agents)\r\n\r\ndef download_image(url, user_agent, output_folder):\r\n    try:\r\n        headers = {'User-Agent': user_agent}\r\n        response = requests.get(url, headers=headers)\r\n        response.raise_for_status()\r\n\r\n        parsed_url = urlparse(url)\r\n        filename = os.path.join(output_folder, os.path.basename(parsed_url.path))\r\n        filename = f\"{os.path.splitext(filename)[0]}_{random.randint(1, 10000)}{os.path.splitext(filename)[1]}\"\r\n\r\n        with lock:\r\n            with open(filename, 'wb') as file:\r\n                file.write(response.content)\r\n\r\n                file_size = os.path.getsize(filename)\r\n                total_size_counter.add_size(file_size)\r\n                print(f\"Downloaded image {url} as {filename}, Size: {file_size \/ (1024 * 1024):.2f} MB\")\r\n\r\n    except Exception as e:\r\n        print(f\"Error downloading image {url}: {e}\")\r\n\r\ndef download_images(url, user_agent, output_folder):\r\n    try:\r\n        headers = {'User-Agent': user_agent}\r\n        response = requests.get(url, headers=headers)\r\n        response.raise_for_status()\r\n\r\n        soup = BeautifulSoup(response.content, 'html.parser')\r\n        img_tags = soup.find_all('img')\r\n\r\n        for img_tag in img_tags:\r\n            img_url = img_tag.get('src')\r\n            if img_url and not img_url.startswith(('data:', 'http:', 'http:')):\r\n                img_url = urljoin(url, img_url)\r\n                thread = threading.Thread(target=download_image, args=(img_url, user_agent, output_folder))\r\n                thread.start()\r\n                
thread.join()\r\n\r\n        img_urls_from_text = re.findall(r'<img[^>]*data-src=[\"\\'](http?:\/\/[^\"\\']+\\.(?:png|jpg|jpeg|gif|bmp))[\"\\'][^>]*>', response.text)\r\n        for img_url in img_urls_from_text:\r\n            thread = threading.Thread(target=download_image, args=(img_url, user_agent, output_folder))\r\n            thread.start()\r\n            thread.join()\r\n\r\n    except Exception as e:\r\n        print(f\"Error downloading images from {url}: {e}\")\r\n\r\ndef main(url, num_iterations):\r\n    start_time = time.time()  # \u8bb0\u5f55\u5f00\u59cb\u65f6\u95f4\r\n\r\n    if not os.path.exists(\"files\"):\r\n        os.makedirs(\"files\")\r\n\r\n    threads = []\r\n\r\n    for _ in range(num_iterations):\r\n        user_agent = generate_user_agent()\r\n        thread = threading.Thread(target=download_images, args=(url, user_agent, \"files\/\"))\r\n        thread.start()\r\n        threads.append(thread)\r\n\r\n    for thread in threads:\r\n        thread.join()\r\n\r\n    end_time = time.time()  # \u8bb0\u5f55\u7ed3\u675f\u65f6\u95f4\r\n    execution_time = end_time - start_time\r\n\r\n    total_downloaded_size_mb = total_size_counter.get_total_size() \/ (1024 * 1024)\r\n    print(f\"Total downloaded size from all threads: {total_downloaded_size_mb:.2f} MB\")\r\n    print(f\"Script execution time: {execution_time:.2f} seconds\")\r\n\r\n    # \u5220\u9664\"files\"\u76ee\u5f55\u53ca\u5176\u5185\u5bb9\r\n    if os.path.exists(\"files\"):\r\n        for file_name in os.listdir(\"files\"):\r\n            file_path = os.path.join(\"files\", file_name)\r\n            os.remove(file_path)\r\n        os.rmdir(\"files\")\r\n\r\nif __name__ == \"__main__\":\r\n    if len(sys.argv) != 3:\r\n        print(\"Usage: python script.py <url> <num_iterations>\")\r\n    else:\r\n        url = sys.argv[1]\r\n        num_iterations = int(sys.argv[2])\r\n        main(url, num_iterations)<\/code><\/pre>\n\n\n\n

<p>\u8bf7\u5728\u7a7a\u76ee\u5f55\u4e2d\u8fd0\u884c\uff0c\u811a\u672c\u5c06\u81ea\u52a8\u5728\u5f53\u524d\u76ee\u5f55\u521b\u5efa<code>files<\/code>\u6587\u4ef6\u5939\uff0c\u811a\u672c\u6267\u884c\u5b8c\u6210\u540e\u5220\u9664\u6240\u6709\u4e0b\u8f7d\u7684\u6587\u4ef6\u3002<\/p>\n\n\n\n

<p>\u811a\u672c\u5c06\u4f1a\u8ba1\u7b97\u6240\u6709\u4e0b\u8f7d\u6587\u4ef6\u7684\u603b\u5927\u5c0f\u4ee5\u53ca\u6267\u884c\u82b1\u8d39\u7684\u65f6\u95f4\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"

\u6700\u8fd1\u7f51\u7ad9\u88ab\u653b\u51fb\uff0ccdn\u4e00\u4e0b\u88ab\u62532TB\uff0c\u7136\u540e\u4e0b\u8f7d\u4e86\u65e5\u5fd7\u8fdb\u884c\u5206\u6790\uff0c\u4f46\u662f\u65e5\u5fd7\u6587\u4ef6\u6709\u51e0\u5341\u5146\uff0c\u6240\u4ee5\u9700\u8981\u505a\u4e00\u4e0b\u5207\u5272\uff0c\u8fd9\u91cc\u6211 […]<\/p>\n","protected":false},"author":1,"featured_media":43113,"comment_status":"open","ping_status":"closed","sticky":false,"template":"single-with-sidebar","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[146,283],"aioseo_notices":[],"_links":{"self":[{"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/posts\/49757"}],"collection":[{"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/comments?post=49757"}],"version-history":[{"count":1,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/posts\/49757\/revisions"}],"predecessor-version":[{"id":50690,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/posts\/49757\/revisions\/50690"}],"wp:featuredmedia":[{"embeddable":true,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/media\/43113"}],"wp:attachment":[{"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/media?parent=49757"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/categories?post=49757"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.xyryd.com\/wp-json\/wp\/v2\/tags?post=49757"}],"curies":[{"name":"wp","href":"http:\/\/api.w.org\/{rel}","templated":true}]}}