diff --git a/defaults/main.yml b/defaults/main.yml
index 040b08d..678ac9b 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -107,3 +107,4 @@ common_iptables_v4: "iptables_default_v4.j2"
 common_iptables_v6: "iptables_default_v6.j2"
 common_snapper: False
 common_smartd: False
+common_zfs: False
diff --git a/files/scripts/zfs_health.sh b/files/scripts/zfs_health.sh
new file mode 100755
index 0000000..0a29b51
--- /dev/null
+++ b/files/scripts/zfs_health.sh
@@ -0,0 +1,122 @@
+#! /bin/sh
+#
+# Calomel.org
+#   https://calomel.org/zfs_health_check_script.html
+#   FreeBSD ZFS Health Check script
+#   zfs_health.sh @ Version 0.17
+
+# Check health of ZFS volumes and drives. On any faults send email.
+
+
+# 99 problems but ZFS ain't one
+problems=0
+
+
+# Health - Check if all zfs volumes are in good condition. We are looking for
+# any keyword signifying a degraded or broken array.
+
+condition=$(/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)')
+if [ "${condition}" ]; then
+    emailSubject="`hostname` - ZFS pool - HEALTH fault"
+    problems=1
+fi
+
+
+# Capacity - Make sure the pool capacity is below 80% for best performance. The
+# percentage really depends on how large your volume is. If you have a 128GB
+# SSD then 80% is reasonable. If you have a 60TB raid-z2 array then you can
+# probably set the warning closer to 95%.
+#
+# ZFS uses a copy-on-write scheme. The file system writes new data to
+# sequential free blocks first and when the uberblock has been updated the new
+# inode pointers become valid. This method is true only when the pool has
+# enough free sequential blocks. If the pool is at capacity and space limited,
+# ZFS will have to write blocks randomly. This means ZFS cannot create an
+# optimal set of sequential writes and write performance is severely impacted.
+
+maxCapacity=80
+
+if [ ${problems} -eq 0 ]; then
+    capacity=$(/sbin/zpool list -H -o capacity | cut -d'%' -f1)
+    for line in ${capacity}
+    do
+        if [ $line -ge $maxCapacity ]; then
+            emailSubject="`hostname` - ZFS pool - Capacity Exceeded"
+            problems=1
+        fi
+    done
+fi
+
+
+# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors
+# on all volumes and all drives using "zpool status". If any non-zero errors
+# are reported an email will be sent out. You should then look to replace the
+# faulty drive and run "zpool scrub" on the affected volume after resilvering.
+
+if [ ${problems} -eq 0 ]; then
+    errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000)
+    if [ "${errors}" ]; then
+        emailSubject="`hostname` - ZFS pool - Drive Errors"
+        problems=1
+    fi
+fi
+
+
+# Scrub Expired - Check that all volumes have been scrubbed within the last
+# 8 days. The general guide is to scrub volumes on desktop quality drives once
+# a week and volumes on enterprise class drives once a month. You can always
+# use cron to schedule "zpool scrub" in off hours. We scrub our volumes every
+# Sunday morning for example.
+#
+# Scrubbing traverses all the data in the pool once and verifies all blocks can
+# be read. Scrubbing proceeds as fast as the devices allow, though the
+# priority of any I/O remains below that of normal calls. This operation might
+# negatively impact performance, but the file system will remain usable and
+# responsive while scrubbing occurs. To initiate an explicit scrub, use the
+# "zpool scrub" command.
+#
+# The scrubExpire variable is in seconds. So for 8 days we calculate 8 days
+# times 24 hours times 3600 seconds to equal 691200 seconds.
+
+scrubExpire=691200
+
+if [ ${problems} -eq 0 ]; then
+    currentDate=$(date +%s)
+    zfsVolumes=$(/sbin/zpool list -H -o name)
+
+    for volume in ${zfsVolumes}
+    do
+        if [ $(/sbin/zpool status $volume | egrep -c "none requested") -ge 1 ]; then
+            printf "ERROR: You need to run \"zpool scrub $volume\" before this script can monitor the scrub expiration time."
+            break
+        fi
+        if [ $(/sbin/zpool status $volume | egrep -c "scrub in progress|resilver") -ge 1 ]; then
+            break
+        fi
+
+        ### Ubuntu with GNU supported date format
+        scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
+        scrubDate=$(date -d "$scrubRawDate" +%s)
+
+        ### FreeBSD with *nix supported date format
+        #scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
+        #scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)
+
+        if [ $(($currentDate - $scrubDate)) -ge $scrubExpire ]; then
+            emailSubject="`hostname` - ZFS pool - Scrub Time Expired. Scrub Needed on Volume(s)"
+            problems=1
+        fi
+    done
+fi
+
+
+# Email - On any problems send email with drive status information and
+# capacities including a helpful subject line. Also use logger to write the
+# email subject to the local logs. This is also the place you may want to put
+# any other notifications like playing a sound file, beeping the internal
+# speaker, paging someone or updating Nagios or even BigBrother.
+
+if [ "$problems" -ne 0 ]; then
+    printf '%s\n' "$emailSubject" "" "`/sbin/zpool list`" "" "`/sbin/zpool status`" | /usr/bin/mail -s "$emailSubject" root@localhost
+    logger "$emailSubject"
+fi
diff --git a/files/scripts/zfs_mount.sh b/files/scripts/zfs_mount.sh
new file mode 100755
index 0000000..d73f828
--- /dev/null
+++ b/files/scripts/zfs_mount.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# https://bitbucket.org/dewoodruff/zfs-on-linux-luks-mountvolumes/src/5836def278a3e462f1f508ba02b7fa236dd28717/mountVolumes.sh
+
+. /etc/zfs_mount_settings.sh
+
+# the real work happens below
+activePools=()
+date >> $LOG
+function getPoolStatus {
+    echo "Checking pool status:" | tee -a $LOG
+    for pool in "${pools[@]}"
+    do
+        echo -en "\t$pool: " | tee -a $LOG
+        status=`zpool status $pool 2>&1 | grep "state:" | cut -f2 -d:`
+        if [ -z "$status" ];
+        then
+            echo "unknown - not imported" | tee -a $LOG
+        else
+            echo $status | tee -a $LOG
+            activePools+=($pool)
+        fi
+    done
+}
+
+function exportActivePools {
+    if [ -n "$activePools" ];
+    then
+        echo -n "Exporting pools... " | tee -a $LOG
+        for pool in "${activePools[@]}"
+        do
+            zpool export -f $pool 2>&1 1>>$LOG || { echo "Problem exporting $pool!" | tee -a $LOG; exit 0; }
+        done
+        echo " done."
+    fi
+}
+
+function importPools {
+    echo -n "Importing pools..."
+    for pool in "${pools[@]}"
+    do
+        zpool import $pool 2>&1 1>>$LOG || { echo "Problem importing $pool!" | tee -a $LOG; exit 0; }
+    done
+    echo " done."
+}
+
+function closeAllLUKS {
+    echo "Making sure all LUKS disks are closed..."
+    for dev in "${devs[@]}"
+    do
+        #echo $dev
+        cryptsetup close $dev 2>&1 1>>$LOG || { echo "Problem closing $dev!" | tee -a $LOG; exit 0; }
+    done
+    echo "Done."
+}
+
+function openAllLUKS {
+    read -s -p "Enter LUKS passphrase: " pass1
+    echo ""
+    read -s -p "Confirm LUKS passphrase: " pass2
+    echo ""
+
+    if [ "$pass1" = "$pass2" ];
+    then
+        for dev in "${!devs[@]}"
+        do
+            echo "Opening $dev to ${devs["$dev"]}" | tee -a $LOG
+            echo "$pass1" | cryptsetup luksOpen $dev ${devs[$dev]} 2>&1 1>>$LOG || { echo "Problem opening $dev!" | tee -a $LOG; exit 0; }
+        done
+    else
+        echo "ERROR: passphrases don't match!"
+    fi
+    pass1=""
+    pass2=""
+}
+
+function LUKSStatus {
+    for dev in "${devs[@]}"
+    do
+        cryptsetup status $dev | head -1 | tee -a $LOG
+    done | sort
+}
+
+function unmount {
+    zfs unshare -a
+    getPoolStatus
+    exportActivePools
+    closeAllLUKS
+    getPoolStatus
+}
+
+if [ "$1" = "status" ];
+then
+    LUKSStatus
+    getPoolStatus
+elif [ "$1" = "mount" ];
+then
+    getPoolStatus
+    exportActivePools
+    closeAllLUKS
+    openAllLUKS
+    importPools
+    getPoolStatus
+    zfs share -a
+elif [ "$1" = "unmount" ];
+then
+    unmount
+elif [ "$1" = "reboot" ];
+then
+    unmount
+    reboot
+elif [ "$1" = "shutdown" ];
+then
+    unmount
+    shutdown -h now
+elif [ "$1" = "freespace" ];
+then
+    zfs list
+else
+    echo "Usage: $0 [status|mount|unmount|reboot|shutdown|freespace]"
+fi
diff --git a/handlers/main.yml b/handlers/main.yml
index 5d6b462..eb7b688 100644
--- a/handlers/main.yml
+++ b/handlers/main.yml
@@ -40,3 +40,12 @@
 
 - name: restart smartd
   service: name=smartd state=restarted
+
+- name: zfs - restart services
+  service: "name={{ item }} state=restarted"
+  with_items:
+    - zfs-import-cache
+    - zfs-import-scan
+    - zfs-mount
+    - zfs-share
+  listen: zfs restart
diff --git a/tasks/main.yml b/tasks/main.yml
index db08dbb..447eb79 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -73,3 +73,6 @@
   when: common_smartd
   tags: ['common', 'smartd']
 
+- import_tasks: zfs.yml
+  when: common_zfs
+  tags: ['common', 'zfs']
diff --git a/tasks/zfs.yml b/tasks/zfs.yml
new file mode 100644
index 0000000..4d6e7bb
--- /dev/null
+++ b/tasks/zfs.yml
@@ -0,0 +1,43 @@
+---
+- name: zfs - linux-headers
+  package: name=linux-headers-{{ ansible_kernel }}
+
+- name: zfs - install ZoL dkms
+  package: name=zfs-dkms
+
+- name: zfs - install ZoL utils
+  package: name=zfsutils-linux
+
+- name: zfs - install zfs-auto-snapshot
+  package: name=zfs-auto-snapshot
+
+- name: zfs - zfs-auto-snapshot find cron files
+  shell: find /etc/cron* -type f -name zfs-auto-snapshot
+  register: snapshot_cron
+  changed_when: False
+
+- name: zfs - zfs-auto-snapshot prefix
+  lineinfile:
+    path: "{{ item }}"
+    regexp: (.*zfs-auto-snapshot.*\d+)\ \ \/\/
+    line: \1 --prefix= //
+    backrefs: yes
+  with_items: "{{ snapshot_cron.stdout_lines }}"
+  when: snapshot_cron.stdout_lines is defined
+
+- name: zfs - load module
+  modprobe: name=zfs
+  notify: zfs restart
+
+- name: zfs - zfs_mount.sh
+  copy: src=scripts/zfs_mount.sh dest=/usr/local/bin/ owner=root group=root mode=0755
+
+- name: zfs - zfs_health.sh
+  copy: src=scripts/zfs_health.sh dest=/usr/local/bin/ owner=root group=root mode=0755
+
+- name: zfs - zfs_health cronjob
+  cron:
+    name: zfs check health
+    minute: 0
+    hour: "7,11,16"
+    job: "/usr/local/bin/zfs_health.sh"
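
Note: zfs_mount.sh sources /etc/zfs_mount_settings.sh, which this diff does not
add. Below is a minimal sketch of what that settings file is expected to provide,
inferred only from the variables the script reads (LOG, the pools array, and the
devs associative array); the pool name, device paths, and mapper names are
placeholder values, not part of this change.

    # /etc/zfs_mount_settings.sh -- example values only, adjust for your hardware
    # Log file that zfs_mount.sh appends its status output to.
    LOG=/var/log/zfs_mount.log

    # ZFS pools that the script imports, exports and shares.
    pools=(tank)

    # LUKS containers backing the pool: key = block device holding the LUKS
    # header, value = /dev/mapper name it is opened as (the device ZFS sees).
    declare -A devs=(
        [/dev/disk/by-id/ata-EXAMPLE-DISK1]=luks-disk1
        [/dev/disk/by-id/ata-EXAMPLE-DISK2]=luks-disk2
    )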