/*
*********************************************************************************************************************************
	                              COMPUTATION OF COEFFICIENT OF INDIVIDUAL AGREEMENT (CIA)					
*********************************************************************************************************************************
AUTHOR INFORMATION:

Authors:	Yi Pan, Jingjing Gao, Michael Haber and Huiman X. Barnhart
Emails:		ypan5@sph.emory.edu, jgao@emory.edu, mhaber@emory.edu, huiman.barnhart@duke.edu
Last revision: Sep. 4 2009

=================================================================================================================================
PROGRAM DESCRIPTION:

The macro provides estimates of the Coefficient of Individual Agreement (CIA) for comparing two methods/observers, X and Y, at a time.
The standard errors of CIAs along with (1-alpha)*100% confidence intervals for CIAs are also provided.   
The mean squared deviation (MSD) is used as the disagreement function.
Tis macro can be applied for binary data or continuous data, but not for categorical data with more than two levels.

Two cases are considered:
(1) When neither of the observers is considered a reference, the the CIA, denoted as psi_N, is defined as the average of the mean 
	within-observer disagreements divided by the mean between-observer disagreement.
(2) When one of the observers can be treated as the golden standard, then the CIA, denoted as psi_R, is defined as the ratio of 
	mean within-observer disagreement and mean between-observer disagreement.

WARNINGS: When the number of subjetcs for calculating Psi_N or Psi_R is less than 10, a WARNING message will be shown in the LOG file.

In order to claim an `acceptable' agreement, the coefficient should be at least 0.8.

CIAs require that at least two readings are conducted for each observer if neither of them is considered as the reference;
at least two readings are required for the reference observer if this observer is treated as the gold standard.

=================================================================================================================================
MACRO VARIABLES:

id: 			the subjects' identifier
data_name:		name of the data set used (should include the library name if not in the default library "WORK")
measurement:	name of the variable containing the outcome observations (binary or continuous outcomes)
method:			name of the variable indicating observers/methods
observer1:		name of the first observer, must exactly match the name as in the dataset (case sensitive).
                If one of the observers is considered as the reference, it should be assigned as observer1
observer2: 		name of the first observer, must exactly match the name as in the dataset (case sensitive)
alpha: 			type I error rate. (1-alpha)*100% confidence intervals will be provided in the output (default = 0.05)
title:  		title of the final output which is designed to help the user to mark the name of the current comparison if 
				multiple comparisons are involved

=================================================================================================================================
DATASET REQUIREMENTS:

The input dataset should take the format as the subjects' identifying number (id) in one column; 
methods/observers used in one column (the same observer should have the same value (case sensitive), 
		e.g. if the first observer is called "x", then "X" would be treated as a different observer. 
		it is the user's responsibility to check it, the macro would not check this);
observations/measurements obtained in one column (binary or continuous outcomes).

We allow the number of replicated measurements to be difference across subjects and observers. 
Missing values are allowed and should be indicated as "."

Example of the data (binary):

              id	Method	Measurement
               1	  x	      0
               1	  x       0
               1	  x	      0
               1	  y	      0
               1	  y	      0
               1	  y  	  .
               2	  x	      0
               2	  x	      0
               2	  x	      0
               2	  y	      0
               2	  y  	  1
               2	  y	      .  

Example of the data (continuous):
          
		   	  id	Method  Measurement
               1	  x	    100
               1	  x     106
               1	  x	    107
               1	  y	    122
               1	  y	    128
               1	  y  	124
               2	  x	    108
               2	  x	    110
               2	  x	    108
               2	  y	    121
               2	  y  	127
               2	  y	    128  

---------------------------------------------------------------------------------------------------------------------------------
If the original data takes the "wide" format (see below for an example), we'd recommend to run the following codes to transform 
the dataset to "long" format.

Example of the data in "wide" format: 
   Obs    id    obs11    obs12    obs13    obs21    obs22    obs23   
     1     1     100      106      107       98       98      111    
     2     2     108      110      108      108      112      110    
	 ...

************************************
Sample SAS Codes for Transformation
************************************;

data wide;
	input Obs id obs11 obs12 obs13 obs21 obs22 obs23;
cards;
1 1 100 106 107 98 98 111    
2 2 108 110 108 108 112 110  
; 

data one(keep=id method rep measurement); 
	set wide;
	retain id method rep measurement;
	method="x";
	array x(1:3) obs11-obs13;
    do i=1 to 3;
               measurement=x(i);
               rep=i;
               output;
    end;
run;

data two(keep=id method rep measurement); 
	set wide;
	retain id method rep measurement;
	method="y";
	array y(1:3) obs21-obs23;
    do i=1 to 3;
               measurement=y(i);
               rep=i;
               output;
    end;
run;

data long;
	merge one two;
	by id method;
	drop rep;
run;

proc print data=long;
run;

************************************
End of SAS Codes for Transformation
************************************;

Example of the data in "long" format after transformation (same as the one shown as the continuous example):
     
              id	Method  Measurement
               1	  x	    100
               1	  x     106
               1	  x	    107
               1	  y	    122
               1	  y	    128
               1	  y  	124
               2	  x	    108
               2	  x	    110
               2	  x	    108
               2	  y	    121
               2	  y  	127
               2	  y	    128  
=================================================================================================================================
OUTPUT:

The following values will be provided in the output:
1. MSD(X,X')  MSD(Y,Y')  MSD(X,Y)
2. Estimate of psi_N,   SE of psi_N,   (1-alpha)*100%CI of psi_N (based on normality)
3. Estimate of psi_R,   SE of psi_R,   (1-alpha)*100%CI of psi_R (based on normality)

=================================================================================================================================
EXAMPLE CODE:

Example of a program that calls the macro:

In this example we run the macro using the 'bland_sbp' data.
There are 3 observers (two human raters and one monitor). Each observer made three replications.  

Suppose that the subject identifier in the original data is the variable 'subject_no'. 
The variable containing the observations is named as 'measurement'.
The variable indicating the observers is named as 'rater'.

In this example we estimate the CIA's for raters x and z (first human rater vs. monitor).

---------------------------------------------------------------------------------------------------------------------------------
Snapshot of the 'bland_sbp' data:
subject_no rater measurement   
	1		 x		100			
	1		 x		106			
	1		 x		107			
	1		 y		 98			
	1		 y		 98			
	1		 y		111			
	1		 z		122			
	1		 z		128			
	1		 z		124			
	2		 x		108			
	2		 x		110			
	2		 x		108			
	2		 y		108			
	2		 y		112			
	2		 y		110		
	2		 z		121			
	2		 z		127			
	2		 z		128
	...

---------------------------------------------------------------------------------------------------------------------------------
*****************
Sample SAS Codes
*****************;
libname dat "D:\CIA\data";  				* Specify the folder where the dataset is stored;

%include 'D:\CIA\program\CIA.sas';	 		* Specify the folder where the SAS Macro is saved;

%CIA(id=subject_no, data_name=dat.bland_sbp, measurement=measurement, method=rater, observer1=x, observer2=z, alpha=0.05, 
	 title=Comparison between observer x and z);

***********************
End of Sample SAS Codes
***********************;

---------------------------------------------------------------------------------------------------------------------------------
If the INPUT DATA is not saved in the "WORK" library, we recommend delete all the datasets in the "WORK" library to avoid confusion.
DO NOT RUN the PROC DATASETS procedure if the INPUT DATA IS located in the "WORK" library. 
*****************
Sample SAS Codes
*****************;
proc datasets library=work nolist kill;		* Delete ALL tempory datasets in WORK library;
run;
quit;

libname dat "D:\CIA\data";  				* Specify the folder where the dataset is stored;

%include 'D:\CIA\program\CIA.sas';	 		* Specify the folder where the SAS Macro is saved;

%CIA(id=subject_no, data_name=dat.bland_sbp, measurement=measurement, method=rater, observer1=x, observer2=z, alpha=0.05, 
	 title=Comparison between observer x and z);

***********************
End of Sample SAS Codes
***********************;
**********************************************************************************************************************************/;


%macro CIA(id=, data_name=, measurement=, method=, observer1=, observer2=, alpha=0.05, title= );
 
OPTIONS FORMDLIM="-";

*=============================================
Transform long data to wide one	for observer1
*=============================================;
data data_x;
	set &data_name;
	if &method ne "&observer1" then delete;
run;

proc transpose data=data_x out=x_out prefix=x;
	by &id;
	var &measurement;
run;

*===============================================
Calculate the maximum replication for observer1
*===============================================;

proc means data=data_x n noprint;
	class &id;
	var &measurement;
	output out=num_x n=num;
run;

data num_x;
	set num_x;
	if &id=. then delete;
	drop _type_ _freq_;
run;

proc means data=num_x max maxdec=0 noprint;
	var num;
	output out=max_x max=rep_X;
run;

data max_x; 
	set max_x;
	*drop _type_ _freq_;
	call symput('rep_X',trim(left(put(rep_X,8.))));
run;

*=============================================
Transform long data to wide one	for observer2
*=============================================;

data data_y;
	set &data_name;
	if &method ne "&observer2" then delete;
run;

proc transpose data=data_y out=y_out prefix=y;
	by &id;
	var &measurement;
run;

data data_xy;
	merge x_out y_out;
	by &id;
run;

*===============================================
Calculate the maximum replication for observer2
*===============================================;

proc means data=data_y n noprint;
	class &id;
	var &measurement;
	output out=num_y n=num;
run;

data num_y;
	set num_y;
	if &id=. then delete;
	drop _type_ _freq_;
run;

proc means data=num_y max maxdec=0 noprint;
	var num;
	output out=max_y max=rep_Y;
run;

data max_y; 
	set max_y;
	drop _type_ _freq_;
	call symput('rep_Y',trim(left(put(rep_Y,8.))));
run;

*===================================
data_xy_n_r is for estimating Psi_N
*===================================;

data data_xy_n_r;
	set data_xy;
	array x(&rep_X) x1-x&rep_X;
	array y(&rep_Y) y1-y&rep_Y;
	n_x=n( of x1-x&rep_X);
	n_y=n( of y1-y&rep_Y);
	if n_x <=1 or n_y<=1 then delete;
	drop _name_;
run;

*==============================================================
if n <= 10, then we want to put a warning in the log for Psi_N
*==============================================================;

proc means data=data_xy_n_r n noprint;
	var id;
	output out=size_xy_n_r n=num_xy_n_r;
run;

data size_xy_n_r;
	set size_xy_n_r;
	call symput('size_n_r',trim(left(put(num_xy_n_r,3.))));
run;

*===================================
data_xy_r is for estimating Psi_R
*===================================;

data data_xy_r;
	set data_xy;
	array x(&rep_X) x1-x&rep_X;
	array y(&rep_Y) y1-y&rep_Y;
	n_x=n( of x1-x&rep_X);
	n_y=n( of y1-y&rep_Y);
	if n_x <=1 or n_y<=0 then delete;
	drop _name_;
run;

proc means data=data_xy_r n noprint;
	var id;
	output out=size_xy_r n=num_xy_r;
run;

*==============================================================
If n <= 10, then we want to put a warning in the log for Psi_R
*==============================================================;

data size_xy_r;
	set size_xy_r;
	call symput('size_r',trim(left(put(num_xy_r,3.))));
run;

*===================================================================================================
* data_xy_n_r is used to calculate Psi_N
* data_xy_r is used to calculate Psi_R
* size_n_r is a global variable containing the available number of subjects in calculation of Psi_N
* size_r is a global variable containing the available number of subjects in calculation of Psi_R
*===================================================================================================;

*==========================================
Estimation of Psi_N: point estimate and SE
*==========================================;

		data est_within_n_r; set data_xy_n_r ;
			array x(&rep_X) x1-x&rep_X;
			array y(&rep_Y) y1-y&rep_Y;
				wmean_x = mean(of x1-x&rep_X);
				wmean_y = mean(of y1-y&rep_Y);
				wstd_x = std(of x1-x&rep_X);
				wstd_y = std(of y1-y&rep_Y);
				wvar_x=(wstd_x)**2;
				wvar_y=(wstd_y)**2;
				sum_diffsq=0;
				do mx=1 to &rep_X;
					do my=1 to &rep_Y;
						if x(mx)~=. and y(my)~=. then sum_diffsq = sum_diffsq + (x(mx)-y(my))**2;
					end;
				end;
				est_wmsd_xx = 2*wvar_x;
				est_wmsd_yy = 2*wvar_y;
				est_wmsd_xy = sum_diffsq/(n_x * n_y);

				keep &id wmean_x wmean_y wvar_x wvar_y est_wmsd_xx est_wmsd_yy est_wmsd_xy ;   
		run;

	ods trace off;
	ods select none;

		proc means mean data=est_within_n_r;
			var  est_wmsd_xx est_wmsd_yy est_wmsd_xy ;
			ods output summary=overall_means_n_r;
		run;

	ods select all;

		data overall_estimates_n_r; set overall_means_n_r;
			est_msd_xx = est_wmsd_xx_Mean;
			est_msd_yy = est_wmsd_yy_Mean;
			est_msd_xy = est_wmsd_xy_Mean;
			keep  est_msd_xx est_msd_yy est_msd_xy; 
		run; 


		data overall_estimates_n_r (rename=(est_msd_xx=MSD_XX est_msd_yy=MSD_YY est_msd_xy=MSD_XY)); 
			set overall_estimates_n_r; 
			if &size_n_r <=10 then do;
         	put 'WARNING: Number of Subjetcs for Calculating Psi_N is less than 10';
			end;
		run;

		proc print data=overall_estimates_n_r noobs;
			var MSD_XX MSD_YY MSD_XY;
			title1 "&title";
			title2 "Estimate of MSD Functions";
		run;


		data simdata_n_r (rename=(est_wmsd_xx=G_X_i est_wmsd_yy=G_Y_i est_wmsd_xy=G_XYXY_i));
			set est_within_n_r;
			keep est_wmsd_xx est_wmsd_yy est_wmsd_xy  ;
		run;

		proc corr data=simdata_n_r cov outp = outp_n_r noprint;
			var G_X_i G_Y_i G_XYXY_i;
		run;

		data var_n_r(keep=Mean_G1 Mean_G2 Mean_G3 Std_G1 Std_G2 Std_G3 N Var_G1 Var_G2 Var_G3  Cov_G12  Cov_G13  Cov_G23);
			merge outp_n_r(rename=(G_X_i = Mean_G1) where=( _TYPE_="MEAN" ))  
			  outp_n_r(rename=(G_Y_i  = Mean_G2) where=( _TYPE_="MEAN" )) 
			  outp_n_r(rename=(G_XYXY_i  = Mean_G3) where=( _TYPE_="MEAN" )) 
			  outp_n_r(rename=(G_X_i = Std_G1) where=( _TYPE_="STD" ))  
			  outp_n_r(rename=(G_Y_i  = Std_G2) where=( _TYPE_="STD" )) 
			  outp_n_r(rename=(G_XYXY_i  = Std_G3) where=( _TYPE_="STD" )) 
			  outp_n_r(rename=(G_X_i = N) where=( _TYPE_="N" )) 

			  outp_n_r(rename=(G_X_i = Var_G1) where=( _TYPE_="COV" and  _NAME_="G_X_i ")) 
	 		  outp_n_r(rename=(G_Y_i  = Var_G2) where=( _TYPE_="COV" and  _NAME_="G_Y_i ")) 
	  		  outp_n_r(rename=(G_XYXY_i  = Var_G3) where=( _TYPE_="COV" and  _NAME_="G_XYXY_i ")) 
	   		  outp_n_r(rename=(G_Y_i  = Cov_G12) where=( _TYPE_="COV" and  _NAME_="G_X_i ")) 
	   		  outp_n_r(rename=(G_XYXY_i  = Cov_G13) where=( _TYPE_="COV" and  _NAME_="G_X_i "))  
	   		  outp_n_r(rename=(G_XYXY_i  = Cov_G23) where=( _TYPE_="COV" and  _NAME_="G_Y_i "));
		run;

	data estimates_n_r; 
		set var_n_r;
		A_n = (Mean_G1 + Mean_G2)/2;   * A (the numerator) in calculating Psi_N;
		B = Mean_G3;                              * B (the denominator) in calculating Psi_N and Psi_R;

		Est_Psi_N = A_N/B;
	
		V_G_Mean_1 = Std_G1**2/N;   * Var(mean of G1);
		V_G_Mean_2 = Std_G2**2/N;   * Var(mean of G2);
		V_G_Mean_3 = Std_G3**2/N;   * Var(mean of G3);

		Cov_G_Mean_12 = Cov_G12/N;  * Cov(mean of G1 and mean of G2);
		Cov_G_Mean_13 = Cov_G13/N;  * Cov(mean of G1 and mean of G3);
		Cov_G_Mean_23 = Cov_G23/N;  * Cov(mean of G2 and mean of G3);

 		Var_A_n = (Std_G1**2 + Std_G2**2 + 2*Cov_G12)/(4*N);
	 	Var_A_n2 = (V_G_Mean_1 + V_G_Mean_2  + 2*Cov_G_Mean_12)/4;

		Var_B = V_G_Mean_3;
	
		Cov_A_n_B = (Cov_G13 + Cov_G23)/(2*N);   * Cov(A, B);
		
		Var_Est_Psi_N = (A_n**2/B**2) * ( Var_A_n/A_n**2 + Var_B/B**2 - 2*Cov_A_n_B/(A_n*B)); 
		
		SE_Est_Psi_N = sqrt(Var_Est_Psi_n);

		alpha2 = 1-&alpha/2;
		z = probit(alpha2);
		CI_Lower_Est_Psi_N = Est_Psi_N - z*SE_Est_Psi_n;
		CI_Upper_Est_Psi_N = Est_Psi_N + z*SE_Est_Psi_n;
		
	run; 

	data estimates_n_r;
		set estimates_n_r;
		*if Est_Psi_N > 1 then CI_Upper_Est_Psi_N=.;
		*if Est_Psi_N > 1 then CI_Lower_Est_Psi_N=.;
		*if CI_Upper_Est_Psi_N > 1 then CI_Upper_Est_Psi_N = 1;
		lower_N=trim(left(put(CI_Lower_Est_Psi_N,5.3)));
		upper_N=trim(left(put(CI_Upper_Est_Psi_N,5.3)));
	
		CI_N= "("||lower_N||","||upper_N||")";
		label CI_N="95% CI for Psi_N";
		beta=(1-&alpha)*100;
		call symput ('percent',trim(left(put(beta,4.1))));
	run;

	data estimates_n_r;
		set estimates_n_r;
		label CI_N=&percent% CI for Psi_N;
	run;

	ods listing;
	ods select all;
		proc print data = estimates_n_r noobs label; 
			var Est_Psi_n SE_Est_Psi_n CI_N;
 			title2 "Estimated Psi_N";
		run;

*==========================================
Estimation of Psi_R: point estimate and SE
*==========================================;

data est_within_r; set data_xy_r ;
			array x(&rep_X) x1-x&rep_X;
			array y(&rep_Y) y1-y&rep_Y;
				wmean_x = mean(of x1-x&rep_X);
				wstd_x = std(of x1-x&rep_X);
				wvar_x=(wstd_x)**2;
				sum_diffsq=0;
				do mx=1 to &rep_X;
					do my=1 to &rep_Y;
						if x(mx)~=. and y(my)~=. then sum_diffsq = sum_diffsq + (x(mx)-y(my))**2;
					end;
				end;
				est_wmsd_xx = 2*wvar_x;
				est_wmsd_xy = sum_diffsq/(n_x * n_y);

			keep &id wmean_x  wvar_x est_wmsd_xx  est_wmsd_xy;   
run;

	ods trace off;
	ods select none;

		proc means mean data=est_within_r;
			var  est_wmsd_xx est_wmsd_xy ;
			ods output summary=overall_means_r;
		run;

	ods select all;

		data overall_estimates_r; set overall_means_r;
			est_msd_xx = est_wmsd_xx_Mean;
			est_msd_xy = est_wmsd_xy_Mean;
			keep  est_msd_xx  est_msd_xy; 
		run; 

		data overall_estimates_r (rename=(est_msd_xx=MSD_XX est_msd_xy=MSD_XY)); 
			set overall_estimates_r; 
			if &size_r <=10 then do;
         	put 'WARNING: Number of Subjetcs for Calculating Psi_R is less than 10';
			end;

		run;

		proc print data=overall_estimates_r noobs;
			var MSD_XX MSD_XY;
			title1 " &title ";
			title2 "Estimate of MSD Functions";
		run;

		data simdata_r (rename=(est_wmsd_xx=G_X_i  est_wmsd_xy=G_XYXY_i));
			set est_within_r;
			keep est_wmsd_xx  est_wmsd_xy  ;
		run;

		proc corr data=simdata_r cov outp = outp_r noprint;
			var G_X_i  G_XYXY_i;
		run;

		data var_r(keep=Mean_G1  Mean_G3 Std_G1 Std_G3 N Var_G1  Var_G3  Cov_G13);
			merge outp_r(rename=(G_X_i = Mean_G1) where=( _TYPE_="MEAN" ))  
			  outp_r(rename=(G_XYXY_i  = Mean_G3) where=( _TYPE_="MEAN" )) 
			  outp_r(rename=(G_X_i = Std_G1) where=( _TYPE_="STD" ))  
			  outp_r(rename=(G_XYXY_i  = Std_G3) where=( _TYPE_="STD" )) 
			  outp_r(rename=(G_X_i = N) where=( _TYPE_="N" )) 

			  outp_r(rename=(G_X_i = Var_G1) where=( _TYPE_="COV" and  _NAME_="G_X_i "))  
	  		  outp_r(rename=(G_XYXY_i  = Var_G3) where=( _TYPE_="COV" and  _NAME_="G_XYXY_i "))  
	   		  outp_r(rename=(G_XYXY_i  = Cov_G13) where=( _TYPE_="COV" and  _NAME_="G_X_i ")) ; 
		run;

		data estimates_r; 
			set var_r;
			A_r = Mean_G1;                           * A (the numerator) in calculating Psi_R;
			B = Mean_G3;                              * B (the denominator) in calculating Psi_N and Psi_R;

			Est_Psi_R = A_R/B;

			V_G_Mean_1 = Std_G1**2/N;   * Var(mean of G1);
			V_G_Mean_3 = Std_G3**2/N;   * Var(mean of G3);

			Cov_G_Mean_13 = Cov_G13/N;  * Cov(mean of G1 and mean of G3);

			Var_A_r = V_G_Mean_1;
			Var_B = V_G_Mean_3;
	
			Cov_A_r_B = Cov_G13/N;   

			Var_Est_Psi_R = (A_r**2/B**2) * ( Var_A_r/A_r**2 + Var_B/B**2 - 2*Cov_A_r_B/(A_r*B)); 

			SE_Est_Psi_R = sqrt(Var_Est_Psi_R);

			alpha2 = 1-&alpha/2;
			z = probit(alpha2);
			
			CI_Lower_Est_Psi_R =  Est_Psi_R - z*SE_Est_Psi_R;
			CI_Upper_Est_Psi_R =  Est_Psi_R + z*SE_Est_Psi_R;
		run; 

		data estimates_r;
			set estimates_r;
			lower=trim(left(put(CI_Lower_Est_Psi_R,5.3)));
			upper=trim(left(put(CI_Upper_Est_Psi_R,5.3)));
		    CI="("||lower||","||upper||")";
			label CI="95% CI for Psi_R";
			beta=(1-&alpha)*100;
			call symput ('percent',trim(left(put(beta,4.1))));
		run;

	data estimates_r;
		set estimates_r;
		label CI=&percent% CI for Psi_R;
	run;

		ods listing;
		ods select all;

		proc print data = estimates_r noobs label; 
			var Est_Psi_R SE_Est_Psi_R CI;
 			title2 "Estimated Psi_R";
		run;
		title;
%mend;