Thursday, July 28, 2011

Simple Script to Download epaper from The Hindu


#!/bin/bash
#ishan dot karve at gmail dot com
#
#Script to download epaper from Hindu
#No more subscription .. pls donate the money to Prime Ministers Welfare Fund
#As always /// Its free to use...
#
# What it does:
#   1. Asks which edition to fetch (Chennai, Hyderabad or Delhi).
#   2. Probes the server with `wget --spider` to count the pages of the main
#      paper (section A) and of the supplement (section B).
#   3. Downloads every page into $HOME/Desktop/hindu_<edition>_<DDMMYYYY>.
#   4. Merges the pages with Ghostscript into
#      The_Hindu_<edition>_<DDMonYYYY>.pdf in the current directory and
#      removes the per-page directory.
#
# Requires: bash, wget, gs (Ghostscript).

# Fail loudly on typos in variable names instead of expanding them to "".
set -u

readonly base_url="http://epaper.thehindu.com/pdf"
#define max incremental page limit
readonly max_spider=100
# Parallel arrays: menu index -> edition code used in the URL / display name.
readonly -a edition_choice=(101 102 103)
readonly -a edition_name=(Chennai Hyderabad Delhi)

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Show the edition menu and read a valid index into the global `ed`.
# Aborts on end-of-input instead of re-prompting forever.
choose_edition() {
  local idx
  echo "Hindu epaper editions are"
  echo "-------------------------------------------------"
  for idx in "${!edition_name[@]}"; do
    echo "${idx}. ${edition_name[idx]}"
  done
  echo "-------------------------------------------------"
  ed=""
  while true; do
    # read returns non-zero at EOF; a non-empty $ed then is a final line
    # without a trailing newline and is still worth validating.
    if ! read -r -p "Enter edition you wish to selec[0-2]: " ed && [[ -z $ed ]]; then
      die "No edition selected (end of input)."
    fi
    case "$ed" in
      [012])
        echo "Thanks."
        break
        ;;
      *)
        echo "Please select the correct numeric serial."
        ;;
    esac
  done
}

# Print the URL of one page.
# Arguments: $1 - section letter (A = main paper, B = supplement)
#            $2 - page number (plain integer, zero-padded here to 3 digits)
# Globals:   base_url, yyyy, mm, dd, edition_code (read)
page_url() {
  local section=$1 page=$2
  printf '%s/%s/%s/%s/%s%s%s%s_%03d%s.pdf' \
    "$base_url" "$yyyy" "$mm" "$dd" "$yyyy" "$mm" "$dd" \
    "$section" "$page" "$edition_code"
}

# Count how many consecutive pages of a section exist on the server.
# Arguments: $1 - section letter
#            $2 - name of the variable that receives the page count
# Probing stops at the first page that wget reports as missing (the
# "broken link!!!" message) or for which wget exits non-zero for any other
# reason (403, DNS failure, ...), and never goes beyond max_spider pages.
count_pages() {
  local section=$1
  local page pageno probe status
  for (( page = 1; page <= max_spider; page++ )); do
    #prepend zero to single digits
    printf -v pageno '%03d' "$page"
    echo "Searching for Page $pageno"
    probe=$(wget --spider "$(page_url "$section" "$page")" 2>&1)
    status=$?
    echo "$probe"
    if (( status != 0 )) || [[ $probe == *'link!!!'* ]]; then
      break
    fi
  done
  # `page` is the first page that was NOT found (or max_spider + 1).
  printf -v "$2" '%d' "$(( page - 1 ))"
}

# Download pages 1..N of a section into the per-day directory.
# Arguments: $1 - section letter
#            $2 - number of pages to fetch
# Globals:   ty_dir (read)
download_section() {
  local section=$1 total=$2
  local page pageno target
  for (( page = 1; page <= total; page++ )); do
    #prepend zero to single digits
    printf -v pageno '%03d' "$page"
    echo "Downloading Page $pageno"
    target="$ty_dir/${section}${pageno}.pdf"
    if ! wget -q -O "$target" "$(page_url "$section" "$page")"; then
      # wget -O leaves an empty/partial file behind on failure; drop it so
      # it is not handed to Ghostscript later.
      printf 'Warning: could not download page %s of section %s\n' \
        "$pageno" "$section" >&2
      rm -f -- "$target"
    fi
  done
}

# Merge all downloaded pages into one PDF in the current directory, then
# remove the per-page files and their directory. The pages are kept if
# Ghostscript fails, so a failed merge does not throw the downloads away.
# Globals: ty_dir, edition_label, dd, mon, yyyy (read)
merge_pages() {
  local output_pdf="The_Hindu_${edition_label}_${dd}${mon}${yyyy}.pdf"
  local -a pages
  # nullglob: an empty directory must yield an empty list, not the literal
  # pattern.
  shopt -s nullglob
  pages=("$ty_dir"/*.pdf)
  shopt -u nullglob
  if (( ${#pages[@]} == 0 )); then
    rmdir -- "$ty_dir"
    die "No pages were downloaded; nothing to combine."
  fi

  echo "Combining all pages into a single pdf document"
  #combine multiple pdf files
  gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE="$output_pdf" -dBATCH \
    "${pages[@]}" \
    || die "Ghostscript failed; downloaded pages were kept in $ty_dir"

  #empty directory
  rm -- "${pages[@]}"
  #remove directory
  rmdir -- "$ty_dir"
}

main() {
  local tool
  # Check tools up front so no bandwidth is spent on a run that cannot finish.
  for tool in wget gs; do
    command -v "$tool" >/dev/null 2>&1 \
      || die "Required tool '$tool' is not installed."
  done

  #Get user to select edition
  choose_edition
  edition_code=${edition_choice[ed]}
  edition_label=${edition_name[ed]}

  # Take the date exactly once so that every URL and path agrees even when
  # the run crosses midnight.
  yyyy="" mm="" dd="" mon=""
  read -r yyyy mm dd mon < <(date '+%Y %m %d %b')
  [[ -n $yyyy && -n $mm && -n $dd && -n $mon ]] \
    || die "Could not determine today's date."

  #spider the selected edition using wget to estimate number of pages
  echo "Estimating number of pages in ${edition_label} edition"
  #start spider for main editon
  count_pages A npages_A
  clear
  echo "Estimating number of pages in ${edition_label} edition supplement"
  #start spider for newapaper supplement
  count_pages B npages_B
  clear

  ty_dir="$HOME/Desktop/hindu_${edition_label}_${dd}${mm}${yyyy}"
  #mkdir to store individual pages
  mkdir -p -- "$ty_dir" || die "Cannot create directory $ty_dir"

  echo "Please be patient..Bandwidth intensive operation starts..;-)"
  echo "Downloading Main Paper .. total $npages_A pages"
  download_section A "$npages_A"
  echo "Downloading Supplement .. total $npages_B pages"
  download_section B "$npages_B"

  merge_pages
}

main "$@"
How to get it running

Copy the script to your Linux desktop and save it as thehindu.sh.
 Open a terminal.
 Type the following commands:

cd ~/Desktop
chmod +x thehindu.sh
./thehindu.sh

4 comments:

  1. any idea for downloading through windows OS...

    ReplyDelete
  2. How to download in android for delhi edition..i have app names terminal emulator

    ReplyDelete
  3. $ ./new.sh
    Hindu epaper editions are
    -------------------------------------------------
    0. Chennai
    1. Hyderabad
    2. Delhi
    -------------------------------------------------
    ./new.sh: line 18: syntax error near unexpected token `$'in\r''
    '/new.sh: line 18: ` case $ed in


    I am getting this error.. Please Resolve. :)

    ReplyDelete
  4. hi
    i have been downloading the epaper since a month
    and it was working fine: both via script or via internet browser direct http:// link

    but today onwards it's not working
    when i try on browser like
    "http://epaper.thehindu.com/pdf/2013/02/19/20130219A_001103.pdf "
    I get the error


    You don't have permission to access /pdf/2013/02/19/20130219A_001103.pdf on this server.

    "FOrbidden
    Apache/2.2.17 (Unix) mod_ssl/2.2.17 OpenSSL/0.9.8e-fips-rhel5 PHP/5.3.5 Server at epaper.thehindu.com Port 80
    "

    or if run script
    same http request fails - 403 FOrbidden

    HOw to resolve this issue now?

    ReplyDelete