#!/bin/bash
# @file
# @brief Import Server Logs for Piwik from Hosts
# @see http://edoceo.com/pub/piwik-import.sh
# @see http://forum.piwik.org/read.php?2,98270,98270

#
# Apache Config:
#    LogFormat "%V %h %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" vhost
#    CustomLog /var/log/apache2/access.log vhost

#
# Lighttpd Config:
#    accesslog.format     = "%V %h %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
#    accesslog.filename   = var.logdir + "/access.log"

#
# Nginx Config:
#    log_format vhost '$http_host $remote_addr - [$time_local] "$request" $status $bytes_sent "$http_referer" "$http_user_agent"';
#    access_log /var/log/nginx/access.log vhost;

# Piwik installation settings.
# piwik_auth and piwik_conf are defined here but not referenced anywhere
# else in this file as shown.
piwik_auth=''
piwik_root='/opt/piwik'
piwik_conf="${piwik_root}/config/config.ini.php"
piwik_site='http://your.host/piwik'

# Log sources, one "host:kind" entry per line.
#   host - machine the logs are copied from; an entry with an empty host
#          part (":kind") is handled on the local machine instead
#   kind - names both the /var/log/<kind>/ directory and the
#          /etc/init.d/<kind> service that gets reloaded
host_kind_list='
www1.host.tld:apache2
www2.host.tld:lighttpd
www3.host.tld:nginx
'

# Number of parallel recorders handed to import_logs.py (--recorders):
# one third of the "processor" lines found in /proc/cpuinfo, plus one,
# so the value is always at least 1. If the count cannot be read it is
# treated as 0 instead of breaking the arithmetic.
cpu=$(grep -c processor /proc/cpuinfo)
cpu=$(( ${cpu:-0} / 3 + 1 ))

# Optional switches passed to import_logs.py. Every switch is off (empty)
# by default; the commented line(s) below each one show the values that
# turn it on. To enable a switch, uncomment the line you want.

# Passes --add-sites-new-hosts to the importer.
add_new_sites=""
# add_new_sites="--add-sites-new-hosts"

# Importer debug output: one --debug, or three for the most detail.
debug=""
# debug="--debug"
# debug="--debug --debug --debug"

# Progress display while importing.
show_progress=""
# show_progress="--show-progress --show-progress-delay=2"

#
# Import one access log file into Piwik using import_logs.py.
#
# Arguments:
#   $1 - path of the log file to import
#   $2 - optional Piwik site id; when given it is passed on as --idsite
# Globals read:
#   piwik_root, piwik_site, cpu, debug, add_new_sites, show_progress
#   piwik_import_log - optional override for the output log path
#                      (default: /var/log/piwik-import.log)
# Outputs:
#   Prints the "wc -l" result for the file on stdout. Everything the
#   importer writes to stdout AND stderr is appended to the import log.
# Returns:
#   Exit status of the importer command.
function do_piwik
{
	local file="$1"
	local site="${2:-}"
	local import_log="${piwik_import_log:-/var/log/piwik-import.log}"
	local lc
	local -a site_opt=()

	if [ -n "$site" ]
	then
		site_opt=("--idsite=$site")
	fi

	lc=$(wc -l "$file")
	echo "Lines to Process: $lc"

	# $debug, $add_new_sites and $show_progress stay unquoted on purpose:
	# each may hold several space-separated options, or be empty and
	# disappear from the command line entirely.
	# The redirection order matters: ">> file" first, then "2>&1", so that
	# stderr follows stdout into the log instead of going to the terminal.
	# shellcheck disable=SC2086
	python "${piwik_root:-/opt/piwik}/misc/log-analytics/import_logs.py" \
		$debug \
		$add_new_sites \
		$show_progress \
		"${site_opt[@]}" \
		--url="$piwik_site" \
		--idsite-fallback=3 \
		--log-format-regex='(?P<host>\S+) (?P<ip>\S+) (?P<user>\S+) \[(?P<date>[\w\/\:]+) (?P<timezone>[\d\-\+]+)\] "\w+ (?P<path>.*?)(?: \S+)" (?P<status>\d+) (?P<length>\d+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)"' \
		--enable-bots \
		--enable-http-errors \
		--enable-http-redirects \
		--enable-static \
		--strip-query-string \
		--recorders="$cpu" \
		"$file" \
		>> "$import_log" 2>&1
}

#
# Start from a clean slate: delete every /var/log/piwik-*log left over from
# an earlier run (this script writes piwik-import.log and piwik-archive.log)
rm -r -f /var/log/piwik-*log


#
# Fetch, clear and import the access logs of every configured host
# (all server kinds listed in host_kind_list, not only Apache)
for host_kind in $host_kind_list
do

	# Entries have the form "host:kind"
	kind=${host_kind#*:}
	host=${host_kind%:*}

	echo "Processing: $host ($kind)"

	# Work inside a private scratch directory. Give up on this entry if it
	# cannot be created or entered: the rsync below runs with --delete
	# against "./" and must never hit the directory the script started in.
	if ! d=$(mktemp -d)
	then
		echo "Skipping ${host:-localhost} ($kind): mktemp failed" >&2
		continue
	fi
	if ! cd "$d"
	then
		echo "Skipping ${host:-localhost} ($kind): cannot enter $d" >&2
		rm -fr -- "$d"
		continue
	fi

	# Copy the logs first. Only after a successful copy are the source
	# logs removed and the service reloaded, so a failed transfer does not
	# throw away data that was never imported.
	copied=no
	if [ -n "$host" ]
	then
		# Remote host: copy over rsync, then clean up through ssh
		if rsync -a --delete "$host:/var/log/$kind/" ./
		then
			copied=yes
			ssh "$host" "rm /var/log/$kind/*log; /etc/init.d/$kind reload >/dev/null;"
		fi
	else
		# Empty host part: the logs live on this machine
		if rsync -a --delete "/var/log/$kind/" ./
		then
			copied=yes
			rm /var/log/"$kind"/*log
			/etc/init.d/"$kind" reload >/dev/null
		fi
	fi

	# ls *.gz

	if [ "$copied" = yes ]
	then
		# Import each match on its own: do_piwik takes a single file as $1
		# and reads a second argument as a site id, not as another file.
		for access_log in access*log
		do
			# An unmatched glob stays as the literal pattern; skip it
			[ -f "$access_log" ] || continue
			do_piwik "$access_log"
		done
	else
		echo "Skipping ${host:-localhost} ($kind): rsync failed, source logs left in place" >&2
	fi

	# echo "Errors from: $host" >> /tmp/webserver-errors.log
	# cat error*log >> /tmp/webserver-errors.log

	cd - >/dev/null
	rm -fr -- "$d"

done

#
# Now run the Archiver.
# ">> file" comes before "2>&1" so that stderr is appended to the archive
# log together with stdout; the summary below searches that log for errors.
php "$piwik_root/misc/cron/archive.php" --url="$piwik_site" >>/var/log/piwik-archive.log 2>&1

# tail -n 20 /var/log/piwik-archive.log
echo
# echo "Errors:"
# Print every line of either log that mentions error/fatal/warn (any case),
# dropping lines that contain '0 HTTP', 'without error' or 'no error'.
# "|| true" keeps a run with nothing to report from ending with a failure
# status.
grep -i -e 'error' -e 'fatal' -e 'warn' \
	/var/log/piwik-import.log /var/log/piwik-archive.log \
	| grep -v -e '0 HTTP' -e 'without error' -e 'no error' || true

#
# Produce a Report of NotFound Sites
# grep 'No Piwik' /var/log/piwik-import.log | cut -d' ' -f4- | sort | uniq
# grep 'Invalid line' /var/log/piwik-*log | cut -d' ' -f4-
