#!/bin/bash
#
# deepcrawler script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/deepcrawler [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during the crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal

depth=6
threads=20
adddays=5
topN=5000 # Comment this statement out if you don't want to set a topN value

# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"

# Parse arguments
if [ "$1" == "safe" ]
then
  safe=yes
fi

if [ -z "$NUTCH_HOME" ]
then
  NUTCH_HOME=.
  echo deepcrawler: $0 could not find environment variable NUTCH_HOME
  echo deepcrawler: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
  echo deepcrawler: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi

if [ -z "$CATALINA_HOME" ]
then
  CATALINA_HOME=/opt/tomcat
  echo deepcrawler: $0 could not find environment variable CATALINA_HOME
  echo deepcrawler: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
  echo deepcrawler: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
fi

if [ -n "$topN" ]
then
  topN="-topN $topN"
else
  topN=""
fi

steps=10
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject crawl1/crawldb urls/seed

echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
  echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
  $NUTCH_HOME/bin/nutch generate crawl1/crawldb crawl1/segments $topN \
      -adddays $adddays
  if [ $? -ne 0 ]
  then
    echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
    break
  fi
  segment1=`ls -d crawl1/segments/* | tail -1`

  $NUTCH_HOME/bin/nutch fetch $segment1 -threads $threads
  if [ $? -ne 0 ]
  then
    echo "deepcrawler: fetch $segment1 at depth `expr $i + 1` failed."
    echo "deepcrawler: Deleting segment $segment1."
    rm $RMARGS $segment1
    continue
  fi

  $NUTCH_HOME/bin/nutch updatedb crawl1/crawldb $segment1
done

echo "----- Generate, Fetch, Parse, Update (Step 3 of $steps) -----"
for((i=0; i < $depth; i++))
do
  echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
  $NUTCH_HOME/bin/nutch generate crawl1/crawldb crawl1/segments -topN 1000 \
      -adddays $adddays
  if [ $? -ne 0 ]
  then
    echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
    break
  fi
  segment2=`ls -d crawl1/segments/* | tail -1`

  $NUTCH_HOME/bin/nutch fetch $segment2 -threads $threads
  if [ $? -ne 0 ]
  then
    echo "deepcrawler: fetch $segment2 at depth `expr $i + 1` failed."
    echo "deepcrawler: Deleting segment $segment2."
    rm $RMARGS $segment2
    continue
  fi

  $NUTCH_HOME/bin/nutch updatedb crawl1/crawldb $segment2
done

echo "----- Generate, Fetch, Parse, Update (Step 4 of $steps) -----"
for((i=0; i < $depth; i++))
do
  echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
  $NUTCH_HOME/bin/nutch generate crawl1/crawldb crawl1/segments -topN 1000 \
      -adddays $adddays
  if [ $? -ne 0 ]
  then
    echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
    break
  fi
  segment3=`ls -d crawl1/segments/* | tail -1`

  $NUTCH_HOME/bin/nutch fetch $segment3 -threads $threads
  if [ $? -ne 0 ]
  then
    echo "deepcrawler: fetch $segment3 at depth `expr $i + 1` failed."
    echo "deepcrawler: Deleting segment $segment3."
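    # A failed fetch leaves a partial segment behind; delete it so it is
    # not merged or indexed later, then continue with the next depth.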
    rm $RMARGS $segment3
    continue
  fi

  $NUTCH_HOME/bin/nutch updatedb crawl1/crawldb $segment3
done

echo "----- Merge Segments (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs crawl1/MERGEDsegments crawl1/segments/*
if [ "$safe" != "yes" ]
then
  rm $RMARGS crawl1/segments
else
  rm $RMARGS crawl1/BACKUPsegments
  mv $MVARGS crawl1/segments crawl1/BACKUPsegments
fi
mv $MVARGS crawl1/MERGEDsegments crawl1/segments

echo "----- Invert Links (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks crawl1/linkdb crawl1/segments/*

echo "----- Index (Step 7 of $steps) -----"
$NUTCH_HOME/bin/nutch index crawl1/NEWindexes crawl1/crawldb crawl1/linkdb \
    crawl1/segments/*

echo "----- Dedup (Step 8 of $steps) -----"
$NUTCH_HOME/bin/nutch dedup crawl1/NEWindexes

echo "----- Merge Indexes (Step 9 of $steps) -----"
$NUTCH_HOME/bin/nutch merge crawl1/NEWindex crawl1/NEWindexes

echo "----- Loading New Index (Step 10 of $steps) -----"
${CATALINA_HOME}/bin/shutdown.sh

if [ "$safe" != "yes" ]
then
  rm $RMARGS crawl1/NEWindexes
  rm $RMARGS crawl1/index
else
  rm $RMARGS crawl1/BACKUPindexes
  rm $RMARGS crawl1/BACKUPindex
  mv $MVARGS crawl1/NEWindexes crawl1/BACKUPindexes
  mv $MVARGS crawl1/index crawl1/BACKUPindex
fi
mv $MVARGS crawl1/NEWindex crawl1/index

${CATALINA_HOME}/bin/startup.sh

echo "deepcrawler: FINISHED: Crawl completed!"
echo ""
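
# Example usage for periodic re-crawling (an assumption, not part of the
# crawl itself): if the script is installed as $NUTCH_HOME/bin/deepcrawler
# and run from $NUTCH_HOME, a crontab entry such as the following could
# trigger a nightly re-crawl in 'safe' mode, keeping temporary directories
# for recovery:
#
#   0 3 * * * cd /opt/nutch && bin/deepcrawler safe >> logs/deepcrawler.log 2>&1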