1

I have some kind of a synchronization script transferring files from a production server to a DR one every day via rsync.

however, since a few days the script seems to be crashing.

they were multiple instances of it running, so I killed them all and restarted the process, however, it doesn't seem to do anything.

when tracing the PID with strace, I can see it re-reading the same data again and again.

the commands ran by the wrapper are :

root     4052277 30.5  0.1  61540 19280 pts/0    S+   11:00   8:32 /opt/csm/64-bit/deps/1.0.6/bin/rsync --archive --compress --recursive --rsh=/usr/bin/ssh -i /root/.ssh/id_rsa_drsync --rsync-path=/usr/bin/rsync --verbose /app/PWC/ xm3p7050vmo://app/PWC
root     4052278  0.0  0.0  52184  6096 pts/0    S+   11:00   0:00 /usr/bin/ssh -i /root/.ssh/id_rsa_drsync xm3p7050vmo /usr/bin/rsync --server -vlogDtprze.iLsf . //app/PWC

when doing an strace -p4052277, I get :

read(3, ",,N,,N,07/09/2017 04:00:17.000000000,D,pc_exe,D,07/09/2017 04:00:17.000000000,D,pc_exe,D,BASTITLE_T,D,wf_DWH_Global,D,s_"..., 262144) = 262144
read(3, ",N,,N,,N,,N,,N,,N,,N,,N,,N,,N,0,D,0,D,,N,,N,1,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N"..., 262144) = 262144
read(3, ",,N,,N,,N,,N,,N,,N,0,D,0,D,,N,,N,1,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,-1,D,,N"..., 262144) = 262144
read(3, ",D,BASTITLE_T,D,wf_DWH_Global,D,s_DWH_T_BASE_TITLE,D,19141,D\n0,D,22529353,D,88380,D,4,D,0174525.001.000.001,D,MC58119990"..., 262144) = 262144
read(3, " 19:54:21.000000000,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,07/09/2017"..., 262144) = 262144
read(3, ",,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,0,D,0,D,,N,,N,1,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,"..., 262144) = 262144
read(3, ",N,,N,,N,0,D,0,D,,N,,N,,N,,N,,N,10/29/2016 19:54:16.000000000,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,"..., 262144) = 262144
read(3, ",N/A,D,N/A,D,N/A,D,N/A,D,2201,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,-1,D,0,D,1,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N"..., 262144) = 262144
read(3, "7 04:00:17.000000000,D,pc_exe,D,07/09/2017 04:00:17.000000000,D,pc_exe,D,BASTITLE_T,D,wf_DWH_Global,D,s_DWH_T_BASE_TITLE"..., 262144) = 262144
read(3, ",D,86550,D,6,D,FX.FORW-0419518-L1-GBP,D,,N,,N,,N,,N,FX.FORW-0419518-L1-GBP,D,,N,,N,,N,,N,999,D,GBP,D,2108,D,999,D,N/A,D,"..., 262144) = 262144
read(3, "-Update: 07.07.2017 05:20:41,D,,N,,N,,N,,N,,N,,N,,N,30100,D,30107,D,30108,D,,N,,N,10/29/2016 00:00:00.000000000,D,0,D,,N"..., 262144) = 262144
read(3, ",,N,,N,,N,,N,,N,-1,D,,N,,N,,N,,N,RTW-Update: 07.07.2017 05:21:17,D,,N,,N,,N,,N,,N,0,D,,N,30100,D,30111,D,30112,D,,N,,N,1"..., 262144) = 262144
read(3, ",,N,,N,10/29/2016 19:54:13.000000000,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N"..., 262144) = 262144
read(3, ",2105,D,999,D,N/A,D,100,D,CPT-001,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,2201,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,-1,D,0,D,1,"..., 262144) = 262144
read(3, "0,D,,N,30100,D,30101,D,30102,D,,N,,N,10/29/2016 00:00:00.000000000,D,0,D,,N,0,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,0,D,0,D,,"..., 262144) = 262144
read(3, ",,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,RTW-Update: 07.07.2017 05:21:05,D,,N,,N,,N,,N,,N,,N,,N,30100,D,30107,D,30108,D,,N,,N,10/"..., 262144) = 262144
read(3, "7.000000000,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,07/09/2017 04:00:1"..., 262144) = 262144
read(3, "00000,D,pc_exe,D,07/09/2017 04:00:17.000000000,D,pc_exe,D,BASTITLE_T,D,wf_DWH_Global,D,s_DWH_T_BASE_TITLE,D,19141,D\n0,D,"..., 262144) = 262144
read(3, "D,N/A,D,N/A,D,N/A,D,N/A,D,0,D,0,D,1,D,0,D,,N,03/14/2016 00:00:00.000000000,D,03/14/2018 00:00:00.000000000,D,,N,,N,,N,,N"..., 262144) = 262144
read(3, ":17.000000000,D,pc_exe,D,BASTITLE_T,D,wf_DWH_Global,D,s_DWH_T_BASE_TITLE,D,19141,D\n0,D,22534150,D,83337,D,6,D,FX.FORW-03"..., 262144) = 262144
read(3, ",N,,N,,N,,N,RTW-Update: 07.07.2017 05:20:51,D,,N,,N,,N,,N,,N,,N,,N,30100,D,30107,D,30108,D,,N,,N,10/29/2016 00:00:00.000"..., 262144) = 262144
read(3, "00,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,07/09/2017 04:00:17.0000000"..., 262144) = 262144
read(3, "N,,N,,N,,N,10/29/2016 19:54:00.000000000,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,"..., 262144) = 262144
read(3, ",22535333,D,82141,D,6,D,FX.FORW-0373675-L2-CHF,D,,N,,N,,N,,N,FX.FORW-0373675-L2-CHF,D,,N,,N,,N,,N,999,D,CHF,D,2108,D,999"..., 262144) = 262144
read(3, "N/A,D,N/A,D,N/A,D,-1,D,0,D,1,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,0,D,0,D,,N,,N,1,D,0,D,,N,,N,,N,"..., 262144) = 262144
read(3, "4795-L1-USD,D,,N,,N,,N,,N,999,D,USD,D,2108,D,999,D,N/A,D,100,D,CHG-210,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,2204,D,N/A,D,N/A,"..., 262144) = 262144
read(3, "BASE_TITLE,D,19141,D\n0,D,22536220,D,82668,D,6,D,FX.FORW-0385515-L1-USD,D,,N,,N,,N,,N,FX.FORW-0385515-L1-USD,D,,N,,N,,N,,"..., 262144) = 262144
read(3, "N,,N,,N,,N,,N,0,D,0,D,,N,,N,,N,,N,,N,10/29/2016 19:53:33.000000000,D,101,D,,N,,N,0,D,0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N"..., 262144) = 262144
read(3, "2017 05:20:26,D,,N,,N,,N,,N,,N,0,D,,N,30100,D,30101,D,30102,D,,N,,N,10/29/2016 00:00:00.000000000,D,0,D,,N,0,D,0,D,,N,,N"..., 262144) = 262144
read(3, ",N,,N,,N,FX.FORW-0387703-L2-CHF,D,,N,,N,,N,,N,999,D,CHF,D,2108,D,999,D,N/A,D,100,D,CHG-210,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A"..., 262144) = 262144
read(3, "obal,D,s_DWH_T_BASE_TITLE,D,19141,D\n0,D,22537407,D,80668,D,4,D,0177715.001.000.756,D,MC5811999000010177715000426,D,,N,,N"..., 262144) = 262144
read(3, ",,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,RTW-Update: 07.07.2017 05:20:05,D,,N,,N,,N,,N,,N,"..., 262144) = 262144
read(3, "N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,07/09/2017 04:00:17.000000000,D,pc_exe,D,07/09/2017 04:00:17.00"..., 262144) = 262144
read(3, "10/29/2016 00:00:00.000000000,D,0,D,,N,0,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,0,D,0,D,,N,,N,,N,,N,,N,10/29/2016 19:53:33.000"..., 262144) = 262144
read(3, "/A,D,N/A,D,N/A,D,2201,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,N/A,D,-1,D,0,D,1,D,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,"..., 262144) = 262144
read(3, "1,D,,N,,N,,N,,N,RTW-Update: 07.07.2017 05:20:03,D,,N,,N,,N,,N,,N,0,D,,N,30100,D,30101,D,30102,D,,N,,N,10/29/2016 00:00:0"..., 262144) = 262144
read(3, "0,D,0,D,,N,,N,,N,,N,,N,,N,0,D,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,,N,07/09/2017 04:00:17.000000000,D,pc_exe,D,07/09/2"..., 262144) = 262144
read(3, "9141,D\n0,D,22539492,D,77738,D,4,D,0171560.001.000.001,D,MC5811999000010171560000159,D,,N,,N,,N,0171560.001.000.001,D,,N,"..., 262144) = 262144

just flooding this same block again and again (the FS is about 150Gbytes)

how can I investigate on why it's doing this ? (it was perfectly working before) I can't reboot the machine as it's in prod.

could it be related to a bad link or anything in the folder ? or a file being in use ? (apparently none are in use, looking at the processes running), is there additional RSYNC options I could add to fix that ?

thanks for your hints regards,

olivierg
  • 494
  • 1
  • 6
  • 24

2 Answers2

1

I would try to first verify if rsync is working properly. Create a small test directory with a few files in source and then sync just that with the destination to see if it works. Also add --stats --verbose option.

I am guessing this might be network related. Can you verify and see if there are any network issues?

Tux_DEV_NULL
  • 1,083
  • 7
  • 11
  • thanks for the tip, I'm having some trouble with ssh keys it seems. surprisingly, when I run "ssh -i /root/.ssh/id_rsa_drsync xm3p7050vmo ls" it works, but "/opt/csm/64-bit/deps/1.0.6/bin/rsync --archive --compress --recursive --rsh=/usr/bin/ssh -i /root/.ssh/id_rsa_drsync --rsync-path=/usr/bin/rsync --verbose /app/PWC/ xm3p7050vmo://app/PW" prompts for the key, like if the -I option wasn't used – olivierg Sep 13 '17 at 10:24
  • after some time, in strace, I get : "select(5, NULL, [4], [4], {12, 901231} " as if the command was waiting for something (but nothing in stdout), and then "0 (Timeout)". I will check the network too but so far everything looks good on this side (no packet loss or anything wrong) – olivierg Sep 13 '17 at 11:25
0

it looks like my filesystem grows to 100% while doing the rsync and that's probably the reason why it fails !

I will see to extend it and rerun the process again (I also had an ssh key error)

olivierg
  • 494
  • 1
  • 6
  • 24
  • Glad to know you figured out. Have a check for disk consumption and also when you run rsync use a file lock so it is not running more than once. – Tux_DEV_NULL Sep 13 '17 at 13:24