[Bash] PDF Scraping

Main Code block ‘pdfscraping.sh’

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/bin/bash

# Extract the useful data from vulnerability-report PDFs in the current directory.
# Prerequisites: poppler-utils, mutt, ssmtp (/etc/ssmtp/ssmtp.conf), getmail4

# Directory the script was started from; only used in the usage text.
CURRENT_DIR="$(pwd)"

# Long-format listing (one `ls -la` line per entry) of everything whose name
# ends in .pdf, case-insensitively. The long format is kept on purpose: the
# file names are recovered further down by dropping the first eight columns.
FILE_LIST="$(ls -la | grep -E '[.][pP][dD][fF]$')"

# Four-line banner shown on stdout and used as the first line of the e-mail.
# Assigned with '=' so an exported BANNER cannot leak into the output.
BANNER="$(printf '%s\n' \
  '#######################################' \
  '--- XXX Vulnerability Report Viewer ---' \
  '------ Author: pippo9 Sep 2018 ------' \
  '#######################################')"

#######################################
# Print the usage text to stderr and terminate the script.
# Globals:   CURRENT_DIR (read)
# Arguments: none
# Outputs:   three lines on stderr
# Returns:   never returns; exits with status 1
#######################################
usage() {
  echo "Usage: ${0} [-e RECEIVER]" >&2
  # Kept on a single line so the sentence is not split in the output.
  echo "The script will automatically process any PDF files under the current directory, which is ${CURRENT_DIR}." >&2
  echo " -e Archieve PDF files and send an email." >&2
  exit 1
}

# Parse the command-line options.
#   -e RECEIVER   turn on archiving + e-mail and remember the recipient
# Every other option ends up in the fallback arm, which prints the usage text.
while getopts e: OPTION; do
  case "${OPTION}" in
    e)
      SEND_EMAIL='true'
      RECEIVER="${OPTARG}"
      ;;
    *)
      usage
      ;;
  esac
done

# Discard the options that were just parsed; OPTIND is the index of the first
# non-option argument, so only the remaining operands stay in "$@".
shift "$(( OPTIND - 1 ))"

# Nothing in the listing matched ".pdf" / ".PDF" / ...: show the usage text
# and stop with a failure status.
[[ -n "${FILE_LIST}" ]] || {
  usage
  exit 1
}

# Recover the bare file names from the `ls -la` lines in FILE_LIST by removing
# the first eight whitespace-separated columns (mode, links, owner, group,
# size and the three date/time fields). Everything after them is kept
# verbatim, so names containing runs of spaces survive unchanged.
# Result: one file name per line in FILE_NAME.
FILE_NAME="$(printf '%s\n' "${FILE_LIST}" \
  | sed -E 's/^([^[:space:]]+[[:space:]]+){8}//')"

# Show the banner followed by one blank line.
echo "${BANNER}"
echo

# ARR_LIST      - PDFs that will later be archived
# EMAIL_CONTENT - lines of the e-mail body; it starts with the same banner
#                 and blank line that were just printed
declare -a ARR_LIST EMAIL_CONTENT
EMAIL_CONTENT+=("${BANNER}" "")

#######################################
# Print each argument on its own line and queue it for the e-mail body.
# Globals:   EMAIL_CONTENT (appended to)
# Arguments: one or more message strings (an empty string gives a blank line)
# Outputs:   the messages on stdout
#######################################
report_line() {
  local msg
  for msg in "$@"; do
    printf '%s\n' "${msg}"
    EMAIL_CONTENT+=("${msg}")
  done
}

# Process every PDF named in FILE_NAME (one name per line).
# For each file the 'Audited on' and 'Reported on' dates are extracted and the
# file is put into one of three classes:
#   * neither date found      -> reported as not a valid vulnerability report
#   * audited date contains the (non-empty) reported date
#                             -> file is queued in ARR_LIST and the
#                                "... discovered" summary line(s) are reported
#   * any other audited date  -> file is queued in ARR_LIST and both dates are
#                                reported as a mismatch
# `IFS= read -r` and the plain here-string keep backslashes and surrounding
# blanks in file names intact.
while IFS= read -r line
do
  # An empty FILE_NAME yields one blank line; there is nothing to convert.
  [[ -n "${line}" ]] || continue

  report_line "Now processing ${line} ..."

  # Convert the PDF once and reuse the text for every lookup below.
  # SERVER_NAME_IP=$(pdftotext -f 1 -l 1 "${line}" - | grep '(')
  PDF_TEXT="$(pdftotext "${line}" -)"
  AUDITED="$(grep 'Audited on' <<< "${PDF_TEXT}" | awk -F 'on ' '{print $2}')"
  REPORTED="$(grep 'Reported on' <<< "${PDF_TEXT}" | awk -F 'on ' '{print $2}')"

  if [[ -z "${AUDITED}" && -z "${REPORTED}" ]]
  then
    # Neither date present: not one of the expected reports.
    report_line "Whoops, looks like it is not a valid vulnerability report" ""
  elif [[ -n "${REPORTED}" && "${AUDITED}" == *"${REPORTED}"* ]]
  then
    # Dates agree. REPORTED must be non-empty here, otherwise the pattern
    # *""* would match every audited date.
    ARR_LIST+=("${line}")
    # cut drops the first three bytes of each summary line.
    SUMMARY="$(grep -E 'vulnerability was discovered|vulnerabilities were discovered' \
      <<< "${PDF_TEXT}" | cut -b 4-)"
    report_line "${SUMMARY}" ""
  elif [[ -n "${AUDITED}" ]]
  then
    # An audited date exists but does not match the reported one.
    ARR_LIST+=("${line}")
    report_line \
      "Info: Audited Date does not equal Reported Date." \
      "Audited on ${AUDITED}" \
      "Reported on ${REPORTED}" \
      ""
  fi
done <<< "${FILE_NAME}"

# Archive the PDFs collected in ARR_LIST, remove the originals and mail the
# report. Runs only when -e was given and at least one PDF was collected.
# Archive naming convention: XXX-$(date +%d%m%y).tar.gz
if [[ "${SEND_EMAIL}" == "true" && "${#ARR_LIST[@]}" -gt 0 ]]
then
  # Compute the name once so tar and mutt always refer to the same file,
  # even if the run crosses midnight.
  ARCHIVE_NAME="XXX-$(date +%d%m%y).tar.gz"

  echo "The following PDFs have been archieved ..."
  printf '%s\n' "${ARR_LIST[@]}"
  echo ""

  # --remove-files deletes each PDF after it has been added to the archive.
  # tar -czf "${ARCHIVE_NAME}" -- "${ARR_LIST[@]}"
  if ! tar --remove-files -czf "${ARCHIVE_NAME}" -- "${ARR_LIST[@]}"
  then
    echo "Whoops, something wrong with the archieved file ..." >&2
    exit 1
  fi

  # Mail the collected report lines with the archive attached. The form
  # `-a file -- recipient` is the one given in mutt's usage synopsis.
  echo "Sending an email to ${RECEIVER} ..."
  if ! printf '%s\n' "${EMAIL_CONTENT[@]}" \
    | mutt -s "XXX-Report" -a "${ARCHIVE_NAME}" -- "${RECEIVER}"
  then
    # A failed send must not be reported as success.
    echo "Whoops, failed to send the email to ${RECEIVER} ..." >&2
    exit 1
  fi

  echo "Email successfully sent out ..."
  exit 0
fi

# Normal end of the script.
exit 0

A config file for SSMTP sendmail ‘/etc/ssmtp/ssmtp.conf’

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# /etc/ssmtp/ssmtp.conf -- a config file for sSMTP sendmail.
#
# See the ssmtp.conf(5) man page for a more verbose explanation of the
# available options.
#
# The person who gets all mail for userids < 1000
# Make this empty to disable rewriting.
root=xxxxx@gmail.com

# The place where the mail goes. The actual machine name is required
# no MX records are consulted. Commonly mailhosts are named mail.domain.com
# The example will fit if you are in domain.com and your mailhub is so named.
# NOTE(review): port 465 is normally the implicit-TLS SMTP port; confirm it is
# meant to pair with UseTLS=YES below (no UseSTARTTLS is set in this file).
mailhub=smtp.gmail.com:465

# Example for SMTP port number 2525
# mailhub=mail.your.domain:2525
# Example for SMTP port number 25 (Standard/RFC)
# mailhub=mail.your.domain
# Example for SSL encrypted connection
# mailhub=mail.your.domain:465

# Where will the mail seem to come from?
#RewriteDomain=

# The full hostname
#Hostname=

# Set this to never rewrite the "From:" line (unless not given) and to
# use that address in the "from line" of the envelope.
FromLineOverride=YES

# Use SSL/TLS to send secure messages to server.
UseTLS=YES
#IMPORTANT: The following line is mandatory for TLS authentication
# NOTE(review): this path looks like the RHEL/CentOS CA bundle location;
# verify that the file exists on the target host.
TLS_CA_File=/etc/pki/tls/certs/ca-bundle.crt

# Use SSL/TLS certificate to authenticate against smtp host.
#UseTLSCert=YES

# Use this RSA certificate.
#TLSCert=/etc/pki/tls/private/ssmtp.pem

# Get enhanced (*really* enhanced) debugging information in the logs
# If you want to have debugging of the config file parsing, move this option
# to the top of the config file and uncomment
# Debug=YES

# Account used to authenticate against the mailhub.
# NOTE(review): the password is stored in plain text; restrict read access to
# this file and replace the xxxxx placeholders with real values.
AuthUser=xxxxx@gmail.com
AuthPass=xxxxx

A config file for mutt ‘~/.muttrc’

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Sender identity used on outgoing mail.
set realname = "PD"
set from = "xxxxx@gmail.com"
set use_from = yes
set envelope_from = yes

# Outgoing (SMTP) and incoming (IMAP) account settings.
# NOTE(review): passwords are stored in plain text; restrict read access to
# this file and replace the xxxxx placeholders with real values.
set smtp_url = "smtp://xxxxx@smtp.gmail.com:587/"
set smtp_pass = "xxxxx"
set imap_user = "xxxxx@gmail.com"
set imap_pass = "xxxxx"
set folder = "imaps://imap.gmail.com:993"
set spoolfile = "+INBOX"
# Require TLS on connections to the remote servers.
set ssl_force_tls = yes
# Colon-separated list of SMTP authentication methods.
# The value previously ended in a stray second quote, which left the string
# unterminated.
set smtp_authenticators = 'gssapi:login'

set editor = "vim"
set charset = "utf-8"
# NOTE(review): an empty value presumably disables saving a local copy of
# sent mail -- confirm against the mutt manual.
set record = ''
bind index G imap-fetch-mail