User:Thumbnails Check Bot/stage-1.sh
Jump to navigation
Jump to search
#!/bin/bash
# Stage 1 of the Thumbnails Check Bot.
# Pass --silent as the first parameter ($1) to suppress the per-file debug
# output.

# Directory holding the state/log files of this stage.
workDir="/home/Thumbnails_Check_Bot"
# Holds the start time (epoch seconds) of the last completed run.
lastRun="${workDir}/stage-1-last.log"
# One line is appended here for every file without a valid thumbnail.
failLog="${workDir}/stage-1-fail.log"
# User-agent sent with every HTTP request.
userAgent="Thumbnails_Check_Bot/0.1 (http://commons.wikimedia.org/wiki/User:Thumbnails_Check_Bot; beta)"
# Number of files per request (should not be greater than 100, to keep the
# GET request short).
limit=50
# Do not scan files newer than N seconds (48h).
skipRecent=172800
# Seconds to sleep before the second check if the first one did not find a
# valid thumbnail.
sleepTime=30
CommonsUser="Thumbnails Check Bot"
# NOTE(review): credential stored in the script itself; presumably a
# placeholder in this copy - prefer reading it from a protected file or the
# environment.
CommonsPasswd="xxx"
#--------------------------------------------------------
# Prepare the counters. They are printed as "name:value" lines at the end
# of the run.
countPages=0
countAll=0
countValid=0
countInvalid=0
countMissing=0
headError=0
detailsError=0
listError=0
countUnknown=0
startTime="$(date --utc +%s)"
# Temporary files: list of all seen mimetypes, list of the mimetypes of the
# failed files, and the base name for the per-request scratch files
# ("${tmpfile}", "${tmpfile}.details", ...).
# mktemp is used because the debianutils "tempfile" tool is deprecated and
# not installed on current systems; without valid paths the rest of the
# script would write to files like ".details" in the current directory.
if ! mimeLog="$(mktemp)" || ! mimeLogFail="$(mktemp)" || ! tmpfile="$(mktemp)" ; then
  echo "stage-1: cannot create temporary files" >&2
  exit 1
fi
#######################################
# Print one "<label>:<mimetype>:<count>" line for every distinct line of a
# mimetype list file, then delete that file. Does nothing if the file does
# not exist.
# Arguments: $1 - label to print in front of every line
#            $2 - path of the list file (one mimetype per line)
# Outputs:   the summary lines on stdout
#######################################
function writelog_mimetypes {
  local label="$1" listFile="$2" count mime
  [[ -e "${listFile}" ]] || return 0
  # "uniq -c" prints "<count> <line>"; read splits that on whitespace. Blank
  # lines (written for files without a mimetype) yield an empty ${mime}
  # instead of being mistaken for a mimetype named like the count.
  sort "${listFile}" | uniq -c | while read -r count mime _ ; do
    echo "${label}:${mime}:${count}"
  done
  rm -f -- "${listFile}"
}

#######################################
# Print the statistics of the run as "key:value" lines and remove all
# temporary files of the run.
# Globals read:    startTime, countPages, countAll, countValid, countInvalid,
#                  countUnknown, countMissing, headError, detailsError,
#                  listError, mimeLog, mimeLogFail, tmpfile
# Globals written: endTime
# Outputs:         the counters, one "countMimetype:<mime>:<n>" /
#                  "countMimetypeFail:<mime>:<n>" line per mimetype, then
#                  endTime and runTime (seconds), all on stdout
#######################################
function writelog {
  local key suffix
  # listError is included so that an aborted listing shows up in the log.
  for key in startTime countPages countAll countValid countInvalid \
      countUnknown countMissing headError detailsError listError ; do
    echo "${key}:${!key}"
  done

  writelog_mimetypes "countMimetype" "${mimeLog}"
  writelog_mimetypes "countMimetypeFail" "${mimeLogFail}"

  # Cleanup of the scratch files; the empty suffix is the base file itself.
  if [[ -n "${tmpfile}" ]] ; then
    for suffix in .head .head.2nd .formated .titles .details .addcat "" ; do
      rm -f -- "${tmpfile}${suffix}"
    done
  fi

  endTime="$(date --utc +%s)"
  echo "endTime:${endTime}"
  echo "runTime:$(( endTime - startTime ))"
}
#--------------------------------------------------------
# Main scan.
#
# Walks the Commons "allimages" list backwards in time, starting at
# (start of this run - skipRecent) and stopping at
# (start of the previous run - skipRecent). For every listed file the
# thumbnail URL is requested with HEAD; files whose thumbnail does not answer
# with HTTP 200 twice in a row are logged and put into a tracking category.
#
# NOTE(review): the parsing below relies on the print_r-style output of
# "format=txt" ("[key] => value" lines) - confirm the API still offers it.
# NOTE(review): the pywikipedia scripts are called with a relative path;
# presumably the script is started from the directory containing
# pywikipedia/ - confirm.

# ${debug} is executed as a command (true/false) in front of the debug echos.
debug=true
if [[ "${1:-}" == "--silent" ]] ; then
  debug=false
fi

apiUrl="http://commons.wikimedia.org/w/api.php"

#######################################
# Send a HEAD request and print the HTTP status code of the answer.
# Arguments: $1 - URL to request
#            $2 - file that receives the response headers
# Outputs:   the status code (second field of the first "HTTP/" line)
# Returns:   non-zero if curl itself failed
#######################################
function head_status {
  curl --user-agent "${userAgent}" --silent --head "$1" 2>/dev/null > "$2" || return 1
  grep -m 1 '^HTTP/' "$2" | cut -d ' ' -f 2
}

#######################################
# Add the tracking category to a file page (logging in first if needed).
# Arguments: $1 - page title
#            $2 - HTTP status code to mention in the edit summary
#######################################
function tag_missing_thumbnail {
  local page="$1" status="$2"
  if python2.7 pywikipedia/login.py -test | grep -i -m 1 "not logged in" &> /dev/null ; then
    python2.7 pywikipedia/login.py -user:"${CommonsUser}" -pass:"${CommonsPasswd}"
  fi
  # add_text.py reads the page titles to edit from a file.
  printf '%s\n' "${page}" > "${tmpfile}.addcat"
  python2.7 pywikipedia/add_text.py \
    -text:"[[Category:Possibly files without thumbnails detected by bot]]" \
    -except:"\[\[Category:Possibly files without thumbnails detected by bot\]\]" \
    -summary:"Bot: I did not found a valid thumbnail, so I add a category. (HTTP status code was ${status})" \
    -file="${tmpfile}.addcat" -always
  rm -f -- "${tmpfile}.addcat"
}

# Upper end of the time window (epoch seconds); moved back page by page.
nextTime=$(( startTime - skipRecent ))

# Lower end of the time window. Without a usable timestamp of the previous
# run there is nothing to compare against: the loop is skipped and this run
# only records its start time as the baseline for the next one.
lastStart=""
if [[ -r "${lastRun}" ]] ; then
  lastStart="$(cat "${lastRun}")"
fi
if [[ "${lastStart}" =~ ^[0-9]+$ ]] ; then
  stopTime=$(( 10#${lastStart} - skipRecent ))
else
  stopTime="${nextTime}"
fi

while (( nextTime > stopTime )) ; do
  countPages=$(( countPages + 1 ))

  # Step 1: fetch the next page of the file list.
  if ! curl --user-agent "${userAgent}" --get --silent \
      --data "action=query" --data "list=allimages" --data "format=txt" \
      --data "aidir=descending" --data "aisort=timestamp" --data "aiprop=mime" \
      --data "ailimit=${limit}" --data "aistart=${nextTime}" \
      "${apiUrl}" > "${tmpfile}" ; then
    # list could not be downloaded
    listError=$(( listError + 1 ))
    writelog
    exit 1
  fi

  # All "File:" titles of the page, joined with "|" for the titles= parameter.
  titles="$(sed 's/^ *//g' "${tmpfile}" \
    | grep -a '^\[title\] => ' \
    | cut -d ' ' -f 3- \
    | grep -a '^File:' \
    | tr '\n' '|' \
    | sed 's/|$//g')"

  # Step 2: fetch thumbnail URL and mimetype of these files.
  if ! curl --user-agent "${userAgent}" --get --silent \
      --data-urlencode "titles=${titles}" \
      --data "action=query" --data "format=txt" --data "prop=imageinfo" \
      --data "iiprop=url|mime" --data "iiurlwidth=120" \
      "${apiUrl}" > "${tmpfile}.details" ; then
    # image details could not be downloaded
    detailsError=$(( detailsError + 1 ))
    writelog
    exit 1
  fi

  # Keep only the title/thumburl/mime lines. A "[missing]" entry is replaced
  # by an empty thumburl and mime line so that the n-th title, thumburl and
  # mime line still belong to the same file.
  sed 's/^ *//g' "${tmpfile}.details" \
    | sed 's/^\[missing\] =>\(.*\)/[thumburl] => \n[mime] => \n/g' \
    | grep -a '^\[\(title\|thumburl\|mime\)\] => ' \
    > "${tmpfile}.formated"

  # Step 3: check every entry of the page.
  for (( nr = 1; nr <= limit; nr++ )) ; do
    countAll=$(( countAll + 1 ))
    title="$(grep -a '^\[title\] => ' "${tmpfile}.formated" | sed -n "${nr}p" | cut -d ' ' -f 3-)"
    thumburl="$(grep -a '^\[thumburl\] => ' "${tmpfile}.formated" | sed -n "${nr}p" | cut -d ' ' -f 3-)"
    mime="$(grep -a '^\[mime\] => ' "${tmpfile}.formated" | sed -n "${nr}p" | cut -d ' ' -f 3-)"
    echo "${mime}" >> "${mimeLog}"

    if [[ -z "${thumburl}" || -z "${mime}" ]] ; then
      # missing file
      countMissing=$(( countMissing + 1 ))
      continue
    fi

    if ! http_code="$(head_status "${thumburl}" "${tmpfile}.head")" ; then
      # headers could not be retrieved
      ${debug} && echo "[?] ${title}"
      countUnknown=$(( countUnknown + 1 ))
    elif [[ "${http_code}" == "200" ]] ; then
      ${debug} && echo "[+] ${title}"
      countValid=$(( countValid + 1 ))
    else
      # sleep and check again
      sleep "${sleepTime}"
      if ! http_code_2nd="$(head_status "${thumburl}" "${tmpfile}.head.2nd")" ; then
        # headers could not be retrieved
        ${debug} && echo "[?] ${title}"
        countUnknown=$(( countUnknown + 1 ))
      elif [[ "${http_code_2nd}" == "200" ]] ; then
        ${debug} && echo "[+] ${title}"
        countValid=$(( countValid + 1 ))
      else
        ${debug} && echo "[-] ${title}"
        echo "${http_code} ${http_code_2nd} ${mime} ${title}" >> "${failLog}"
        echo "${mime}" >> "${mimeLogFail}"
        countInvalid=$(( countInvalid + 1 ))
        # add cat
        tag_missing_thumbnail "${title}" "${http_code_2nd}"
      fi
      rm -f -- "${tmpfile}.head.2nd"
    fi
    rm -f -- "${tmpfile}.head"
  done

  rm -f -- "${tmpfile}.formated" "${tmpfile}.titles" "${tmpfile}.details"

  # Step 4: continue at the timestamp the API names for the next page.
  nextStart="$(tac "${tmpfile}" | grep -m 1 -i -a '\[aistart\]' | sed 's/^ *//g' | cut -d ' ' -f 3-)"
  # Without a usable continuation timestamp the scan cannot go on: an empty
  # value would make date answer with "today 00:00" and restart the scan
  # inside the skipRecent window. Abort like a failed list download, so the
  # timestamp of the last run stays untouched and the next run retries.
  if [[ -z "${nextStart}" ]] || ! nextTime="$(date --utc --date="${nextStart}" +%s)" ; then
    listError=$(( listError + 1 ))
    writelog
    exit 1
  fi
  rm -f -- "${tmpfile}"
done

writelog
# Remember the start of this run as the lower bound for the next one.
echo "${startTime}" > "${lastRun}"
exit 0