bash图片爬取脚本
Tofloor
poster avatar
151******18
deepin
2018-11-27 07:04
Author
本帖最后由 sunowsir 于 2018-11-26 23:14 编辑

> 大佬不要嘲笑我,我还是个小白
> 在学习了python基础语法之后又看了看正则表达式的样例,然后,果断去用bash写了一个爬图片的脚本。因为python不熟悉,这个脚本前前后后断断续续改了半个多月,终于差不多可以用了。
  1. #!/bin/bash

  2. #########################################################OTHER FUNCTIONS START

  3. # Help information. #
  # Print the help text and terminate the script.
  # Globals:   none
  # Arguments: none
  # Outputs:   the help text on stdout
  # Returns:   does not return; exits the shell with status 0
  Usage() {
    # printf '%s\n' prints each argument on its own line without interpreting
    # backslashes, so the text below is emitted exactly as written.
    printf '%s\n' \
      'You need to install wget and curl and make the script executable before using it.' \
      'Install wget (Debian): sudo apt-get install wget' \
      'Install curl (Debian): sudo apt-get install curl' \
      'Change mode:           sudo chmod +x re_pic.sh' \
      '' \
      'Usage:    ./re_pic.sh [command] [parameter]' \
      '' \
      '    --help         Get help information.' \
      '    -u --url       Given a URL, crawl the images from that URL. Special characters in the URL must be escaped with a backslash.' \
      '    -p --path      Given a path to save the downloaded images. The default is to create the re_pic.d directory in the directory of the script to save the pictures.' \
      '    -s --suffix    According to the download link of the picture to determine the type of picture, use this parameter to filter the type of picture the user needs to download.' \
      '    -n --name      Determine the name of the image according to the download link of the picture, and use this parameter to screen the pictures that users need to download.' \
      '    -w --word      After the image download link protocol, add the known missing path before the path.' \
      '' \
      'Statement: Please beware of copyright issues, everything related to image copyright issues is not related to this script.'
    exit 0
  }

  23. # Determine if the image exists, if there is a size of zero. #
  # Report whether a usable image file is already present on disk.
  # Arguments:
  #   $1 - save directory prefix; it is concatenated verbatim with $2, so
  #        callers are expected to pass it with a trailing "/"
  #   $2 - image file name
  # Returns:
  #   0 if "${1}${2}" is a regular file with a size greater than zero
  #   1 if it is missing, empty, or not a regular file
  Judge_img() {
    local -r target="${1}${2}"

    # Use the shell's own file tests instead of parsing `ls` output: they cope
    # with spaces and glob characters in the name and need no extra processes.
    if [[ -f "${target}" && -s "${target}" ]]; then
      return 0
    fi
    return 1
  }

  37. # Download. #
  # Download one image with wget, retrying a bounded number of times.
  # Arguments:
  #   $1 - save directory prefix (concatenated verbatim with $2)
  #   $2 - image file name
  #   $3 - image URL
  # Returns:
  #   0 if Judge_img accepts the downloaded file
  #   1 if the file is still unusable after the initial attempt and 5 retries
  Down_img() {
    local -r target="${1}${2}"
    local -r url=${3}
    local retries=5

    # Initial attempt. wget's own output is discarded; success is decided by
    # inspecting the resulting file with Judge_img.
    wget -c -nv -nc --dns-timeout=5 --read-timeout=6 --connect-timeout=15 \
      --output-document="${target}" "${url}" > /dev/null 2>&1

    # Arithmetic test, so the loop really stops once the counter reaches zero
    # (a string test on the counter would always be true).
    while (( retries-- > 0 )); do
      if Judge_img "${1}" "${2}"; then
        return 0
      fi
      # Remove the partial/empty file first: with -nc wget refuses to write
      # to an output file that already exists.
      rm -f -- "${target}"
      wget -c -nv -nc --dns-timeout=5 --read-timeout=6 --connect-timeout=10 \
        --output-document="${target}" "${url}" > /dev/null 2>&1
    done

    # Verdict on the last retry.
    Judge_img "${1}" "${2}" || return 1
    return 0
  }

  # Parse the command-line options into global variables.
  # Globals (written):
  #   suffix      - URL filter from -s/--suffix, default '**'
  #   search_name - image-name filter from -n/--name, default '**'
  #   source_url  - page URL from -u/--url
  #   save_path   - target directory from -p/--path, always ending in "/"
  #   add_word    - path fragment from -w/--word
  #   para_num    - number of arguments received
  # Arguments: the option/value pairs given to the script
  # Returns:
  #   0 on success; calls Usage on an unknown option or on an option that
  #   has no value
  Para_analysis() {
    suffix='**'
    search_name='**'
    para_num=$#

    while (( $# > 0 )); do
      # Every recognised option takes exactly one value, so a single
      # leftover argument is either an unknown flag or a flag with no value.
      if (( $# < 2 )); then
        Usage
        return 1
      fi

      case "${1}" in
        "-u" | "--url" | "--URL" | "--Url")
          source_url=${2}
          ;;
        "-p" | "--path" | "--Path" | "--PATH")
          save_path=${2}
          # Callers build file paths as "${save_path}${name}", so make sure
          # the directory ends with a slash.
          if [[ "${save_path}" != */ ]]; then
            save_path="${save_path}/"
          fi
          ;;
        "-s" | "--suffix" | "--Suffix" | "--SUFFIX")
          suffix=${2}
          ;;
        "-n" | "--name" | "--Name" | "--NAME")
          search_name=${2}
          ;;
        "-w" | "--word" | "--Word" | "--WORD")
          add_word=${2}
          ;;
        *)
          Usage
          return 1
          ;;
      esac
      shift 2
    done

    return 0
  }

  # Fetch the page source with curl into ./.re_pic.get.webcode, retrying a
  # bounded number of times while the file is still empty.
  # Arguments:
  #   $1 - page URL
  # Returns:
  #   0 if the page file is non-empty afterwards
  #   1 if it is still missing or empty after the initial attempt and 5 retries
  Get_code() {
    local -r page_file='./.re_pic.get.webcode'
    local retries=5

    # The URL is quoted so that "?", "*" and "[" in it are not glob-expanded.
    curl --connect-timeout 10 "${1}" -L -o "${page_file}" --silent

    # Arithmetic test, so the loop really stops once the counter reaches zero
    # (a string test on the counter would always be true).
    while (( retries-- > 0 )); do
      if [[ -s "${page_file}" ]]; then
        break
      fi
      curl --connect-timeout 10 "${1}" -L -o "${page_file}" --silent
    done

    if [[ -s "${page_file}" ]]; then
      return 0
    fi
    return 1
  }

  # Extract image links from the saved page source.
  # Reads ./.re_pic.get.webcode, takes every <img ...> tag, splits it on
  # single and double quotes and keeps the pieces that look like a URL or a
  # path (optional "scheme:" followed by at least one "/segment").
  # Globals (written):
  #   urls - the matching links, sorted, de-duplicated, one per line
  # Arguments:
  #   $1 - grep pattern the link must match; empty or '**' means no filter
  # Returns:
  #   0 if at least one link was found, 1 otherwise
  Get_urls() {
    local -r page_file='./.re_pic.get.webcode'
    local -r filter=${1-}
    local candidates

    # POSIX character classes are used instead of \w and \S so the patterns
    # do not depend on GNU grep extensions.
    candidates=$(
      grep -Eo '<img[^>]*' "${page_file}" 2> /dev/null \
        | tr -s '"' '\n' \
        | tr -s "'" '\n' \
        | grep -Eo '([[:alnum:]_]*:)*/*(/[^[:space:]]+)+'
    )

    if [[ -n "${filter}" && "${filter}" != '**' ]]; then
      candidates=$(printf '%s\n' "${candidates}" | grep -- "${filter}")
    fi

    urls=$(printf '%s\n' "${candidates}" | sort -u)

    if [[ -z "${urls}" ]]; then
      return 1
    fi
    return 0
  }

  154. #########################################################OTHER FUNCTIONS END

  # Script entry point: parse the command line, download the page at the
  # given URL, extract the image links from it and fetch every image that
  # passes the name/suffix filters.
  # Globals:
  #   suffix, search_name, source_url, save_path, add_word - option values,
  #     initialised here and then written by Para_analysis
  #   urls - newline-separated image links, written by Get_urls
  # Arguments: the script's command-line arguments
  # Outputs:   progress and statistics on stdout, errors on stderr
  # Returns:   0 when the crawl ran to completion, 1 on a fatal error
  main() {
    local -r page_file='./.re_pic.get.webcode'
    local nowpath source_protocol img_url img_name
    local total_num=0 failed_num=0 success_num=0

    if [[ "${1-}" == '--help' ]]; then
      Usage
    fi

    # Start from known values so nothing leaks in from the environment.
    suffix='**'
    para_num=0
    search_name='**'
    source_url=''
    save_path=''
    add_word=''

    # Work from the script's own directory: the page file and the default
    # download directory are both relative to it.
    nowpath=$(cd "$(dirname "$0")" && pwd) || return 1
    cd "${nowpath}" || return 1

    # Parameter analysis. "$@" is quoted so arguments keep their spaces.
    Para_analysis "$@"

    if [[ -z "${source_url}" ]]; then
      echo -e "\033[1;31m URL is empty.\033[0m" >&2
      Usage
      return 1
    fi

    echo
    printf '\033[1;32mSource : \033[1;34m%s\033[0m\n' "${source_url}"

    # Protocol of the page URL, reused for image links that carry none.
    source_protocol='http:'
    if [[ "${source_url}" =~ ^(http[[:alnum:]_]*:) ]]; then
      source_protocol=${BASH_REMATCH[1]}
    fi

    # Default download directory; file paths are built as
    # "${save_path}${img_name}", so the trailing slash is required.
    if [[ -z "${save_path}" ]]; then
      save_path='./re_pic.d/'
    fi
    if [[ "${save_path}" != */ ]]; then
      save_path="${save_path}/"
    fi
    if ! mkdir -p -- "${save_path}"; then
      printf '\033[1;31mERROR\033[0m : Cannot create directory "%s".\n' "${save_path}" >&2
      return 1
    fi

    # Start from an empty page file so a leftover from an earlier run cannot
    # be mistaken for a fresh download.
    : > "${page_file}"

    # Get the web code.
    if ! Get_code "${source_url}"; then
      echo -e "\033[1;31mERROR\033[0m : Failed to get page source code." >&2
      rm -f -- "${page_file}"
      return 1
    fi

    # The filter is quoted: the default '**' must reach Get_urls literally
    # instead of being glob-expanded to file names.
    if ! Get_urls "${suffix}"; then
      echo -e "\033[1;31mERROR\033[0m : No images in this URL." >&2
      rm -f -- "${page_file}"
      return 1
    fi

    # The list is read on descriptor 3 so commands inside the loop cannot
    # consume it through stdin.
    while IFS= read -r img_url <&3; do
      if [[ -z "${img_url}" ]]; then
        continue
      fi

      # Image name: everything up to the last ".xxx" three-character
      # extension, with the directory part removed. Links with no such
      # extension yield an empty name and are skipped.
      img_name=$(printf '%s\n' "${img_url}" \
        | grep -Eo '[^[:space:]]*\.[[:alnum:]_]{3}' \
        | head -n 1)
      img_name=${img_name##*/}
      if [[ -z "${img_name}" ]]; then
        continue
      fi

      # Keep only the names the user asked for.
      if ! printf '%s\n' "${img_name}" | grep -q -- "${search_name}"; then
        continue
      fi

      # Links without a protocol: drop the leading slashes, then prepend the
      # page's protocol and the optional user-supplied path fragment.
      if [[ "${img_url}" != *http* ]]; then
        img_url=$(printf '%s\n' "${img_url}" | sed 's/^\/*//')
        img_url="${source_protocol}//${add_word}${img_url}"
      fi

      # Skip pictures that are already on disk (counted as failures in the
      # statistics, as before).
      if Judge_img "${save_path}" "${img_name}"; then
        printf '\033[1;33mWARNING\033[0m : The image "%s" already exists\n' "${img_name:0:25}"
        (( total_num += 1 ))
        (( failed_num += 1 ))
        continue
      fi

      # Clear any empty or partial leftover before downloading.
      rm -f -- "${save_path}${img_name}"

      # Download image from ${img_url}.
      if Down_img "${save_path}" "${img_name}" "${img_url}"; then
        (( success_num += 1 ))
        printf '\033[1;32m%s... -> %s.../...%s\033[0m\n' \
          "${img_url:0:30}" "${save_path:0:25}" "${img_name:0:25}"
      else
        (( failed_num += 1 ))
        printf '\033[1;31mERROR\033[0m : The image "%s" download failed.\n' "${img_name:0:25}" >&2
      fi

      (( total_num += 1 ))
    done 3< <(printf '%s\n' "${urls}" | tr -s ' ' '\n')

    # Print statistics.
    echo -e "\033[1;32mTotal : ${total_num} \033[1;32mSuccess : ${success_num} \033[1;31mFailure : ${failed_num}\033[0m"

    # Delete the saved page source.
    rm -f -- "${page_file}"

    return 0
  }

  # Run the crawler. "$@" is quoted so arguments containing spaces reach main
  # intact, and main's status becomes the script's exit status.
  main "$@"
  status=$?

  # Print the end information. #
  echo -e "\033[1;38mDone.\033[0m"
  exit "${status}"
Copy the Code
https://github.com/sunowsir/MyShell

Reply Favorite View the author
All Replies
avatar
dot_8
deepin
2018-11-27 07:36
#1
外行人表示 看不懂~
Reply View the author