how to pass list to parallel

I am trying to use parallel in the following script:
#!/bin/bash
declare -a ephemeral_list
for mount in $(lsblk | grep ^x | awk '{ print $1 }')
do
if ! mount | grep $mount >/dev/null; then
ephemeral_list+=($mount)
fi
done
for i in "${!ephemeral_list[#]}"
do
printf "%s\t%s\n" "$i" "${ephemeral_list[$i]}"
[ -d /mnt/ephemeral$i ] || mkdir /mnt/ephemeral$i
mkfs.ext4 -E nodiscard /dev/${ephemeral_list[$i]} && mount /dev/${ephemeral_list[$i]} /mnt/ephemeral$i &
done
I want to run the "mkfs.ext4 -E nodiscard /dev/${ephemeral_list[$i]} && mount /dev/${ephemeral_list[$i]} /mnt/ephemeral$i &" command on each CPU here.
Any help?
Thanks

Make a function. Call that.
mymkfs() {
printf "%s\t%s\n" "$1" "$2"
[ -d /mnt/ephemeral$1 ] || mkdir /mnt/ephemeral$1
mkfs.ext4 -E nodiscard /dev/"$2" && mount /dev/"$2" /mnt/ephemeral$1
}
export -f mymkfs
parallel mymkfs {#} {} ::: "${ephemeral_list[@]}"
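By default GNU parallel starts one job per CPU core, which matches what the question asks for. If you want to control the number of simultaneous jobs explicitly, -j does that; a minimal sketch (nproc reports the CPU count):
# run at most one job per CPU core; {#} is the job number, {} the device name
parallel -j "$(nproc)" mymkfs {#} {} ::: "${ephemeral_list[@]}"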

Bash script functions overflowing into others

Morning,
I'm trying to consolidate a number of smaller scripts into a single large bash script where everything is called via functions.
Most functions work fine (e.g. script.sh update), however running script.sh status, for example, starts giving errors related to the docker() function.
I've corrected all the errors I can via shellcheck and tried adding return to each function, but it's still calling the wrong functions.
Here is the script in full:
#!/bin/bash
# variables and arguments
main() {
export XZ_OPT=-e9
distro=$(awk -F'"' '/^NAME/ {print $2}' /etc/os-release)
username=$(grep home /etc/passwd | sed 1q | cut -f1 -d:)
directory_home="/home/$username"
directory_script="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
rclone_command="rclone --config=$directory_script/rclone.conf"
docker_restart=("flexget" "cbreader" "syncthing")
args "$#"
}
args() {
action=$1
case "$action" in
archive) archive ;;
borg) borg ;;
docker) docker ;;
logger) logger ;;
magnet) magnet ;;
payslip) payslip ;;
permissions) permissions ;;
rclone) rclone_mount ;;
sshfs) sshfs_mount ;;
status) status ;;
sync) sync ;;
update) update ;;
*) echo "$0" && available_options ;;
esac
}
# functions
function available_options() {
sed -n '/^\tcase/,/\tesac$/p' "$0" | cut -f1 -d")" | sed '1d;$d' | sort | tr -d "*" | xargs
return
}
function plural() {
if (("$1">1))
then
echo s
fi
return
}
function dir_find() {
find "$directory_home" -maxdepth 3 -mount -type d -name "$1"
return
}
function domain_find() {
file_config_traefik="$(dir_find config)/traefik/traefik.toml"
awk -F'"' '/domain/ {print $2}' "$file_config_traefik"
return
}
function git_config() {
git config --global user.email "$username@$(domain_find)"
git config --global user.name "$username"
git config pack.windowMemory 10m
git config pack.packSizeLimit 20m
return
}
function delete_docker_env() {
if [[ -f "$directory_script/.env" ]]
then
echo Deleting existing env file
rm "$directory_script/.env"
fi
return
}
function delete_docker_compose() {
if [[ -f "$directory_script/docker-compose.yml" ]]
then
echo Deleting existing env file
rm "$directory_script/docker-compose.yml"
fi
return
}
function write_docker_env() {
{
printf "NAME=%s\\n" "$username"
printf "PASS=%s\\n" "$docker_password"
printf "DOMAIN=%s\\n" "$(domain_find)"
printf "PUID=%s\\n" "$(id -u)"
printf "PGID=%s\\n" "$(id -g)"
printf "TZ=%s\\n" "$(cat /etc/timezone)"
printf "HOMEDIR=%s\\n" "$directory_home"
printf "CONFDIR=%s\\n" "$(dir_find config)"
printf "DOWNDIR=%s\\n" "$(dir_find downloads)"
printf "POOLDIR=%s\\n" "$(dir_find media)"
printf "SAVEDIR=%s\\n" "$(dir_find saves)"
printf "SYNCDIR=%s\\n" "$(dir_find vault)"
printf "WORKDIR=%s\\n" "$(dir_find paperwork)"
printf "RCLONE_REMOTE_MEDIA=%s\\n" "$(rclone_remote media)"
printf "RCLONE_REMOTE_SAVES=%s\\n" "$(rclone_remote saves)"
printf "RCLONE_REMOTE_WORK=%s\\n" "$(rclone_remote work)"
} > "$directory_script/.env"
return
}
function payslip_config_write() {
{
printf "[retriever]\\n"
printf "type = SimpleIMAPSSLRetriever\\n"
printf "server = imap.yandex.com\\n"
printf "username = %s\\n" "$payslip_username"
printf "port = 993\\n"
printf "password = %s\\n\\n" "$payslip_password"
printf "[destination]\\n"
printf "type = Maildir\\n"
printf "path = %s/\\n" "$directory_temp"
} > getmailrc
return
}
function payslip_decrypt() {
cd "$(dir_find paperwork)" || exit
for i in *pdf
do
fileProtected=0
qpdf "$i" --check || fileProtected=1
if [ $fileProtected == 1 ]
then
qpdf --password=$payslip_encryption --decrypt "$i" "decrypt-$i" && rm "$i"
fi
done
return
}
function rclone_remote() {
$rclone_command listremotes | grep "$1"
return
}
function check_running_as_root() {
if [ "$EUID" -ne 0 ]
then
echo "Please run as root"
exit 0
fi
return
}
function include_credentials() {
source "$directory_script/credentials.db"
return
}
function archive() {
rclone_remote=$(rclone_remote backups)
working_directory=$(dir_find backups)/archives
echo "$working_directory"
if [ -z "$*" ]
then
echo Creating archives...
# build folder array?
cd "$(mktemp -d)" || exit
for i in "config" "vault"
do
tar -cJf "backup-$i-$(date +%Y-%m-%d-%H%M).tar.xz" --ignore-failed-read "$HOME/$i"
done
echo "Sending via rclone..."
for i in *
do
du -h "$i"
$rclone_command move "$i" "$rclone_remote"/archives/
done
echo Cleaning up...
rm -r "$PWD"
echo Done!
else
echo Creating single archive...
cd "$(mktemp -d)" || exit
tar -cJf "backup-$1-$(date +%Y-%m-%d-%H%M).tar.xz" --ignore-failed-read "$directory_home/$1"
echo "Sending via rclone..."
for i in *
do
du -h "$i" && $rclone_command move "$i" "$rclone_remote"/archives/
done
echo Cleaning up...
rm -r "$PWD"
echo Done!
fi
return
}
function update-arch() {
if [ -x "$(command -v yay)" ]
then
yay -Syu --noconfirm
else
pacman -Syu --noconfirm
fi
return
}
function update-debian() {
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get dist-upgrade -y
apt-get autoremove --purge -y
apt-get clean
if [ -x "$(command -v youtube-dl)" ]
then
youtube-dl -U
fi
if [ -x "$(command -v rclone)" ]
then
curl --silent "https://rclone.org/install.sh" | bash
fi
return
}
function update-remaining() {
if [ -f "$directory_home/.config/retroarch/lrcm/lrcm" ]
then
"$directory_home/.config/retroarch/lrcm/lrcm" update
fi
find "$(dir_find config)" -maxdepth 2 -name ".git" -type d | sed 's/\/.git//' | xargs -P10 -I{} git -C {} pull
if [ -x "$(command -v we-get)" ]
then
pip3 install --upgrade git+https://github.com/rachmadaniHaryono/we-get
fi
if [ -x "$(command -v plowmod)" ]
then
su -c "plowmod -u" -s /bin/sh "$username"
chown -R "$username":"$username" "$directory_home/.config/plowshare"
fi
return
}
function borg() {
# https://opensource.com/article/17/10/backing-your-machines-borg
working_directory=$(dir_find backups)/borg
echo "$working_directory"
return
}
function docker() {
delete_docker_env
# delete_docker_compose
include_credentials
# update submodules
git pull --recurse-submodules
# write compose file
# {
# printf "nope"
# } > docker-compose.yml
# write env file
write_docker_env
# clean up existing stuff
echo Cleaning up existing docker files
for i in volume image system network
do
docker "$i" prune -f
done
docker system prune -af
# make network, if not existing
if ! printf "$(docker network ls)" | grep -q "proxy"
then
echo Creating docker network
docker network create proxy
fi
# start containers
echo Starting docker containers
docker-compose up -d --remove-orphans
delete_docker_env
return
}
function logger() {
git_config
git_directory="$(dir_find logger)"
file_git_log="$git_directory/media.log"
log_command="git --git-dir=$git_directory/.git --work-tree=$git_directory"
log_remote=$(rclone_remote media)
if [ ! -e "$git_directory" ]
then
mkdir "$git_directory" # make log directory
fi
if [ ! -e "$git_directory/.git" ]
then
$log_command init # initialise git repo
fi
if [ -e "$file_git_log.xz" ]
then
xz -d "$file_git_log.xz" # if xz archive exists, decompress
fi
if [ -e "$file_git_log" ]
then
rm "$file_git_log"
fi
$rclone_command ls "$log_remote" | sort -k2 > "$file_git_log" # create log
$rclone_command size "$log_remote" >> "$file_git_log" # append size
$log_command add "$file_git_log" # add log file
$log_command commit -m "Update: $(date +%Y-%m-%d)" # commit to repo, datestamped
if [ -e "$file_git_log.xz" ]
then
rm "$file_git_log.xz"
fi
xz "$file_git_log" # compress log
$log_command gc --aggressive --prune # compress repo
return
}
function magnet() {
if [ ! -f "$(dir_find vault)/*.magnet" ]
then
echo No magnet files found
exit 0
fi
mag2tor_script_path="$(dir_find config)/magnet2torrent/Magnet_To_Torrent2.py"
if [ ! -f "$mag2tor_script_path" ]
then
echo script not found, downloading
git clone "https://github.com/danfolkes/Magnet2Torrent.git" "$(dir_find config)/magnet2torrent"
fi
sshfs_mount
cd "$(dir_find vault)" || exit
for i in *.magnet
do
magnet_source="$(cat "$i")"
python "$mag2tor_script_path" -m "$magnet_source" -o "$(dir_find downloads)/remote/watch/"
rm "$i"
done
return
}
function payslip() {
# depends on: getmail4 mpack qpdf
directory_temp="$(mktemp -d)"
include_credentials
cd "$directory_temp" || exit
mkdir {cur,new,tmp}
payslip_config_write
getmail --getmaildir "$directory_temp"
cd new || exit
grep "$payslip_sender" ./* | cut -f1 -d: | uniq | xargs munpack -f
mv "*.pdf" "$(dir_find paperwork)/"
payslip_decrypt
rm -r "$directory_temp"
return
}
function permissions() {
check_running_as_root
chown "$username":"$username" "$directory_script/rclone.conf"
return
}
function rclone_mount() {
echo rclone mount checker
for i in backups media paperwork pictures saves
do
mount_point="$directory_home/$i"
if [[ -f "$mount_point/.mountcheck" ]]
then
echo "$i" still mounted
else
echo "$i" not mounted
echo force unmounting
fusermount -uz "$mount_point"
echo sleeping
sleep 5
echo mounting
$rclone_command mount "drive-$i": "/home/peter/$i" --vfs-cache-mode minimal --allow-other --allow-non-empty --daemon --log-file "$(dir-find config)/logs/rclone-$i.log" # --allow-other requires user_allow_other in /etc/fuse.conf
echo restarting docker containers
for j in "${docker_restart[#]}"
do
docker restart "$j"
done
fi
done
return
}
function sshfs_mount() {
include_credentials
echo sshfs mount checker
seedbox_host="$seedbox_username.seedbox.io"
seedbox_mount="$(dir_find downloads)/remote"
if [[ -d "$seedbox_mount/files" ]]
then
echo "sshfs mount exists"
else
echo "sshfs mount missing, mounting"
printf "%s" "$seedbox_password" | sshfs "$seedbox_username#$seedbox_host":/ "$seedbox_mount" -o password_stdin -o allow_other
fi
return
}
function status() {
status_filename=$(dir_find blog)/status.md
status_timestamp="$(date +%Y-%m-%d) at $(date +%H:%M)"
status_uptime=$(( $(cut -f1 -d. </proc/uptime) / 86400 ))
status_cpuavgs=$(cut -d" " -f1-3 < /proc/loadavg)
status_users=$(uptime | grep -oP '.{3}user' | sed 's/\user//g' | xargs)
status_ram=$(printf "%.0f" "$(free | awk '/Mem/ {print $3/$2 * 100.0}')")
status_swap=$(printf "%.0f" "$(free | awk '/Swap/ {print $3/$2 * 100.0}')")
status_rootuse=$(df / | awk 'END{print $5}')
status_dluse=$(df | awk '/downloads/ {print $5}')
status_dockers=$(docker ps -q | wc -l)/$(docker ps -aq | wc -l)
status_packages=$(dpkg -l | grep ^ii -c)
status_ifdata=$(vnstat -i eth0 -m --oneline | cut -f11 -d\;)
{
printf -- "---\\nlayout: page\\ntitle: Server Status\\ndescription: A (hopefully) recently generated server status page\\npermalink: /status/\\n---\\n\\n"
printf "*Generated on %s*\\n\\n" "$status_timestamp"
printf "* Uptime: %s" "$status_uptime"
printf " Day%s\\n" "$(plural "$status_uptime")"
printf "* CPU Load: %s\\n" "$status_cpuavgs"
printf "* Users: %s\\n" "$status_users"
printf "* RAM Usage: %s%%\\n" "$status_ram"
printf "* Swap Usage: %s%%\\n" "$status_swap"
printf "* Root Usage: %s\\n" "$status_rootuse"
printf "* Downloads Usage: %s\\n" "$status_dluse"
printf "* [Dockers](https://github.com/breadcat/Dockerfiles): %s\\n" "$status_dockers"
printf "* Packages: %s\\n" "$status_packages"
printf "* Monthly Data: %s\\n\\n" "$status_ifdata"
printf "Hardware specifications themselves are covered on the [hardware page](/hardware/#server).\\n"
} > "$status_filename"
return
}
function sync() {
source=$(rclone_remote gdrive | sed 1q)
dest=$(rclone_remote gdrive | sed -n 2p)
echo Syncing "$source" to "$dest"
$rclone_command sync "$source" "$dest" --drive-server-side-across-configs --verbose --log-file "$(dir_find config)/logs/rclone-sync-$(date +%Y-%m-%d-%H%M).log"
return
}
function update() {
check_running_as_root
if [[ $distro =~ "Debian" ]]
then
update-debian
elif [[ $distro =~ "Arch" ]]
then
update-arch
else
echo "Who knows what you're running"
fi
update-remaining
return
}
main "$#"
I believe you have a namespace problem.
You define a docker() function that does all sorts of things.
Then inside docker() you call $(docker network ls), which just calls the same function recursively, and inside status() you call $(docker ps -aq | wc -l).
There is only one namespace: after you define a function named docker with docker() {}, anywhere you call docker it will call that function.
You can use command, e.g. echo() { printf "I AM NOT ECHO\n"; }; echo 123; command echo 123 - the first echo 123 will execute the function if it exists, while the second one will look up the echo executable in PATH and execute it.
However, I'd suggest simply using a unique prefix for your function names so they don't interfere with anything. Declaring a function named docker hides the real command.
blabla_status() {} # instead of status()
blabla_docker() {} # instead of docker
# etc..
# then later in main()
case "$1" in
docker|status) blabla_"$1"; ;;
*) echo "Unknown function" >&2; ;;
esac
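If you do keep a wrapper function around docker, the calls inside it that are meant to reach the real binary can be made explicit with command; a minimal sketch based on the prune loop from the question:
blabla_docker() {
# "command" bypasses the shell function lookup and runs the docker binary from PATH
for i in volume image system network
do
command docker "$i" prune -f
done
command docker system prune -af
}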

bash script loop to check if variable contains string - not working

I have a script which copies files from one s3 bucket to a local server, does some stuff and uploads them to another s3 bucket.
In the original bucket I have a few folders, one of them called "OTHER".
I don't want my script to work on this folder.
I tried to define a loop that checks whether the path string does not contain the string "OTHER", and only then continues to the other commands, but for some reason it is not working.
What am I doing wrong?
#!/bin/bash
shopt -s extglob
gcs3='s3://gc-reporting-pud-production/splunk_printer_log_files/'
gcs3ls=$((aws s3 ls 's3://gc-reporting-pud-production/splunk_printer_log_files/' --recursive) | sed 's/^.*\(splunk_printer.*\)/\1/g'| tr -s ' ' | tr ' ' '_')
ssyss3=s3://ssyssplunk
tokenFile=/splunkData/GCLogs/tokenFile.txt
nextToken=$((aws s3api list-objects-v2 --bucket "gc-reporting-pud-production" --prefix splunk_printer_log_files/ --max-items 5) |grep -o 'NEXTTOKEN.*' |awk -F " " '{print $2}')
newToken=$( tail -n 1 /splunkData/GCLogs/tokenFile.txt )
waterMark=$(aws s3api list-objects-v2 --bucket "gc-reporting-pud-production" --prefix splunk_printer_log_files/ --max-items 5 --starting-token $newToken|sed 's/^.*\(splunk_printer.*zip\).*$/\1/'|sed '1d'|sed '$d')
while true; do
for j in $waterMark ; do
echo $j
if [ "$j" != *"OTHER"* ]; then
gcRegion=$(echo $j | awk -F'/' '{print $2}')
echo "gcRegion:"$gcRegion
if [ "$gcRegion" != "OTHER" ]; then
gcTech=$(echo $j | awk -F'/' '{print $3}')
echo "GCTech:"$gcTech
gcPrinterFamily=$(echo $j | awk -F'/' '{print $4}')
echo "gcPrinterFamily:" $gcPrinterFamily
gcPrinterType=$(echo $j | awk -F'/' '{print $5}')
echo "gcPrinterType:" $gcPrinterType
gcPrinterName=$(echo $j| awk -F'/' '{print $6}')
echo "gcPrinterName:" $gcPrinterName
gcFileName=$(echo $j| awk -F'/' '{print $7}'| awk -F'.zip' '{print $1}')
echo "gcFileName:" $gcFileName
cd /splunkData/GCLogs
dir="/splunkData/GCLogs/$gcRegion/$gcTech/$gcPrinterFamily/$gcPrinterType/$gcPrinterName"
echo "dir:"$dir
mkdir -p $dir
aws s3 sync $gcs3$gcRegion/$gcTech/$gcPrinterFamily/$gcPrinterType/$gcPrinterName/ $dir
find $dir -name '*.zip' -exec sh -c 'unzip -o -d "${0%.*}" "$0"' '{}' ';'
aws s3 cp $dir $ssyss3/$gcRegion/$gcTech/$gcPrinterFamily/$gcPrinterType/$gcPrinterName/ --recursive --exclude "*.zip"
newToken=$( tail -n 1 /splunkData/GCLogs/tokenFile.txt )
nextToken=$(aws s3api list-objects-v2 --bucket "gc-reporting-pud-production" --prefix splunk_printer_log_files/ --max-items 5 --starting-token $newToken |grep -o 'NEXTTOKEN.*' |awk -F " " '{print $2}')
waterMark=$(aws s3api list-objects-v2 --bucket "gc-reporting-pud-production" --prefix splunk_printer_log_files/ --max-items 5 --starting-token $newToken|sed 's/^.*\(splunk_printer.*zip\).*$/\1/'|sed '1d'|sed '$d')
echo "$nextToken" > "$tokenFile"
fi
fi
done
done
You need to use the double-bracket conditional command to turn == and != into pattern matching operators:
if [[ "$j" != *"OTHER"* ]]; then
# ^^ ^^
Or use case
case "$j" in
*OTHER*) ... ;;
*) echo "this is like an `else` block" ;;
esac
Paste your code into https://www.shellcheck.net/ for other things to fix.
I think glenn jackman was on the right path. Try this:
if [[ "$j" != *OTHER* ]]; then
The [[ ]] is required for pattern string matching (and you have to remove the quotes around the pattern). The case statement is also a good idea. You can abandon the shell test altogether and use grep as follows:
if
grep -q '.*OTHER.*' <<< "$j" 2>/dev/null
then
...
fi
Here's a check of the [[ ]]:
$ echo $j
abOTHERc
$ [[ "$j" == *OTHER* ]]
$ echo $?
0
As per BenjaminW., the quotes around $j in [[ ]] are unnecessary. However, the quotes around *OTHER* do make a big difference. See below:
$ j="OTHER THINGS"
$ [[ $j == "*OTHER*" ]] ; echo "$j" matches '"*OTHER*"': $?
OTHER THINGS matches "*OTHER*": 1
$ [[ $j == *OTHER* ]] ; echo "$j" matches '*OTHER*': $?
OTHER THINGS matches *OTHER*: 0
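Applied to the loop in the question, the same pattern test can skip the unwanted paths up front; a sketch using the question's variable names:
for j in $waterMark ; do
echo "$j"
# skip any key whose path contains OTHER
if [[ $j == *OTHER* ]]; then
continue
fi
# ... rest of the processing ...
done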

Unix Bash - Copy files from a source folder recursively to destination/*file_extension*(ex. “txt”) folder

This is my code; something in the rec_copy() function isn't working properly, probably this line:
cp $1/$f $HOME/$2/$dest
The extension-named folders are created in the destination folder, but the files are not copied there. Can you help me?
#!/bin/bash
if [ $# -ne 2 ]
then
echo "Usage: $0 <source> <destination>"
exit
fi
if [ ! -d $1 ]
then
echo "Source folder does not exist"
exit
fi
if [ -d $2 ]
then
rm -r $2
mkdir $2
else
mkdir $2
fi
extension=`ls -l $1 | grep -v "^d" | awk '{ print $10; }' | sed 's/^.*\.//g'`
for f in $extension
do
if [ ! -d $1/$f ]
then
mkdir $2/$f
fi
done
rec_copy(){
folder=`ls $1`
for f in $folder
do
dest=`echo "$f" | sed 's/.*\.//g'`
if [ -f $1/$f ]
then
cp $1/$f $HOME/$2/$dest
elif [ -d $1/$f ]
then
rec_copy $1/$f
fi
done
}
rec_copy $1
Here is the answer in case someone ever needs it:
#!/bin/bash
if [ $# -ne 2 ]
then
echo "Usage: $0 <izvor> <destinacija>"
exit
fi
if [ ! -d "$1" ]
then
echo "Izvorniot folder ne postoi"
exit
fi
if [ -d "$2" ]
then
rm -r "$2"
mkdir "$2"
else
mkdir "$2"
fi
extension=`ls -l "$1" | grep -v "^d" | awk '{ print $10; }' | sed 's/^.*\.//g' | sort -u`
for f in $extension
do
if [ ! -d "$1/$f" ]
then
mkdir "$2/$f"
fi
done
rec_copy(){
folder=`ls "$1"`
for f in $folder
do
dest=`echo "$f" | sed 's/.*\.//g'`
if [ -f "$1/$f" ]
then
cp "$1/$f" "$2/$dest"
elif [ -d "$1/$f" ]
then
rec_copy "$1/$f" "$2"
fi
done
}
rec_copy "./$1" "./$2"

How to pipe aws s3 cp to gzip to be used with "$QUERY" | psql utility

I have the following command:
"$QUERY" | psql -h $DB_HOST -p $DB_PORT -U $DB_USERNAME $DB_NAME
where $QUERY is a command that loads files from a bucket, unzips them, and puts them into the database. It looks like the following:
COPY my_table
FROM PROGRAM 'readarray -t files <<<"$(aws s3 ls ${BUCKET_PATH} | tr [:space:] "\n")"; for (( n = ${#files[@]} - 1; n >= 0; n--)); do if [[ ${files[$n]} =~ .csv.gz$ ]]; then aws s3 cp ${BUCKET_PATH}${files[$n]} >(gzip -d -c); break; fi done'
WITH DELIMITER ',' CSV
Here is formatted bash code:
#!/usr/bin/env bash
raw_files=`aws s3 ls ${BUCKET_PATH} | tr [:space:] "\n"`
readarray -t files <<<"$raw_files"
for (( n = ${#files[@]} - 1; n >= 0; n--)); do
if [[ ${files[$n]} =~ .csv.gz$ ]];
then aws s3 cp ${BUCKET_PATH}${files[$n]} >(gzip -d -c);
break; # for test purposes, load just one file instead of all of them
fi
done
aws-CLI version
#: aws --version
#: aws-cli/1.11.13 Python/3.5.2 Linux/4.13.0-43-generic botocore/1.4.70
This script works. But when I try to use it with psql, it fails, and I cannot understand why.
How can I fix it?
Here is a script that loads data from the s3 bucket and merges it into one fat file:
#!/usr/bin/env bash
bucket_path=$1
limit_files=$2
target_file_name=$3
echo "Source bucket $bucket_path"
if [ -z $target_file_name ]; then
target_file_name="fat.csv.gz"
echo "Default target file $target_file_name"
fi
echo "Total files $(aws s3 ls $bucket_path | wc -l)"
readarray -t files <<<"$(aws s3 ls $bucket_path | tr [:space:] "\n")"
for (( n = ${#files[@]} - 1, i=1; n >= 0; n--)); do
if [[ ${files[$n]} =~ .csv.gz$ ]]; then
aws s3 cp --quiet $bucket_path${files[$n]} >(cat >> "$target_file_name");
echo "$((i++)), ${files[$n]}, current size: $(du -sh $target_file_name)"
if [ ! -z $limit_files ] && [ $i -gt $limit_files ]; then
echo "Final size $(du -sh $target_file_name)"
exit 0
fi
fi
done
exit 0
It works correctly.
But when I try to pipe this fat.csv.gz to the psql db using the following code
echo "COPY my_table
FROM PROGRAM 'gzip -d -c fat.csv.gz'
WITH DELIMITER ',' CSV" | psql -h $DB_HOST -p $DB_PORT -U $DB_USERNAME $DB_NAME
I am getting the error:
ERROR: must be superuser to COPY to or from a file
It looks like this is specific to how pg works (I guess it's due to security reasons) - link
So the problem now is that I don't know how to rework my script so it can pipe the fat.csv.gz. I cannot get such a privilege and need to find a workaround.
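The usual workaround, and what the final script below does, is to run the decompression on the client side and feed the data to COPY ... FROM STDIN, which does not need superuser; a minimal sketch with the same placeholder connection variables:
# FROM STDIN reads from the psql client's standard input, so no superuser is needed
gzip -d -c fat.csv.gz | \
psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USERNAME" "$DB_NAME" \
-c "COPY my_table FROM STDIN WITH DELIMITER ',' CSV"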
I finally wrote the following bash script: it downloads files from s3, merges them into 50MB archives and pipes them to pg in subprocesses. Hope it will be helpful for somebody:
get_current_timestamp() (
date '+%s.%N'
)
execute_sql() (
write_log "Importing data from s3 to pg..."
import_data_from_s3 "$EVENTS_PATH"
write_log "Importing data from s3 to pg...done"
)
columns() (
local columns=`echo "SELECT array_to_string(
array(SELECT column_name::text
FROM information_schema.columns
WHERE table_name ILIKE '${TMP_TABLE}'
AND column_name NOT ILIKE '${DATE_FIELD}'), ',')" | \
psql --tuples-only -h $DB_HOST -p $DB_PORT -U $DB_USERNAME $DB_NAME`
echo -n "${columns}"
)
get_timestamp_difference() (
FROM=$1
TO=$2
echo $FROM $TO | awk '{
diff = $2-$1
if (diff >= 86400) {
printf "%i days ", diff/86400
}
if (diff >= 3600) {
printf "%i hours ", (diff/3600)%24
}
if (diff >= 60) {
printf "%i mins ", (diff/60)%60
}
printf "%f secs", diff%60
}'
)
pretty_size() (
if [ ! -z $1 ]; then
local size=$1;
else
local size=`cat <&0`;
fi
echo "${size}" | \
awk '{ \
split( "B KB MB GB" , v ); \
s=1; \
while( $1>=1024 ) { \
$1/=1024; s++ \
} \
printf "%.1f%s", $1, v[s] \
}' | \
add_missing_eol >&1
)
import_data_from_s3() (
local bucket_path=$1
local limit_files=$2
local target_file_name=$3
write_log "Source bucket $bucket_path"
if [ -z ${target_file_name} ]; then
target_file_name="fat.csv.gz"
write_log "Default target file $target_file_name"
fi
if [ ! -z ${limit_files} ]; then
write_log "Import ${limit_files} files"
else
write_log "Import all files"
fi
write_log "Total files $(aws s3 ls $bucket_path | wc -l)"
readarray -t files <<<"$(aws s3 ls $bucket_path | tr [:space:] "\n")"
write_log "Remove old data files..."
find . -maxdepth 1 -type f -name "*${target_file_name}" -execdir rm -f {} +;
write_log "Remove old data files...done"
TMP_TABLE_COLUMNS=$(columns)
write_log "Importing columns: ${DW_EVENTS_TMP_TABLE_COLUMNS}"
declare -A pids
local total_data_amount=0
local file_size_bytes=0
local size_limit=$((50*1024*1024))
for (( n = ${#files[@]} - 1, file_counter=1, fat_file_counter=1; n >= 0; n--)); do
if [[ ! ${files[$n]} =~ .csv.gz$ ]]; then continue; fi
file="${fat_file_counter}-${target_file_name}"
aws s3 cp --quiet ${bucket_path}${files[$n]} >(cat >> "${file}");
file_size_bytes=$(stat -c%s "$file")
if [ $file_size_bytes -gt $size_limit ]; then
import_zip "${file}" "$(pretty_size ${file_size_bytes})" & pids["${file}"]=$!;
total_data_amount=$((total_data_amount+file_size_bytes))
write_log "Files read: ${file_counter}, total size(zipped): $(pretty_size ${total_data_amount})"
((fat_file_counter++))
fi
# write_log "${file_counter}, ${files[$n]}, current size: $(du -sh $file)"
if [ ! -z ${limit_files} ] && [ ${file_counter} -gt ${limit_files} ]; then
write_log "Final size $(du -sh ${file})"
if [ ! ${pids["${file}"]+0} ]; then
import_zip "${file}" "$(pretty_size ${file_size_bytes})" & pids["${file}"]=$!;
fi
break;
fi
((file_counter++))
done
# import the remaining file, which can be smaller than the size limit
if [ ! ${pids["${file}"]+0} ]; then
import_zip "${file}" "$(pretty_size ${file_size_bytes})" & pids["${file}"]=$!;
fi
write_log "Waiting for all pids: ${pids[*]}"
for pid in ${pids[*]}; do
wait $pid
done
write_log "All sub process have finished. Total size(zipped): $(pretty_size ${total_data_amount})"
)
import_zip() (
local file=$1
local size=$2
local start_time=`get_current_timestamp`
write_log "pid: $!, size: ${size}, importing ${file}...";
gzip -d -c ${file} | \
psql --quiet -h ${DB_HOST} -p ${DB_PORT} -U ${DB_USERNAME} ${DB_NAME} \
-c "COPY ${TMP_TABLE}(${TMP_TABLE_COLUMNS})
FROM STDIN
WITH DELIMITER ',' CSV";
rm $file;
local end_time=`get_current_timestamp`
write_log "pid: $!, time: `get_timestamp_difference ${start_time} ${end_time}`, size: ${size}, importing ${file}...done";
)

rake - running shell command returns error

I am trying to run in Rake the following shell command:
sh "d='jps -l | grep jar | cut -d ' ' -f 1'; if [ -z \"$d\" ]; then :; else kill \"$d\"; fi;"
However I get:
sh: -f 1: not found
If I run it in a Linux shell it works fine.
What is wrong?
I interpreted your question wrong earlier. This is what you want.
d='jps -l | grep jar | cut -d " " -f 1; if [ -z "$d" ]; then :; else kill "$d"; fi;'
system(d)
OR
If you want the output of the command (which I guess you don't in this case):
output = `jps -l | grep jar | cut -d " " -f 1; if [ -z "$d" ]; then :; else kill "$d"; fi;`
You need to escape your single quotes and quote the whole string:
d='jps -l | grep jar | cut -d \' \' -f 1; if [ -z "$d" ]; then :; else kill "$d"; fi;'
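For reference, the shell command itself appears to intend a command substitution rather than a literal string assignment (an assumption about the intent, since a single-quoted assignment does not run the pipeline); written out in plain bash it would be:
# capture the PID of the first jar process (if any) and kill it
d=$(jps -l | grep jar | cut -d ' ' -f 1)
if [ -z "$d" ]; then :; else kill "$d"; fi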
