Upload
jiachen-yang
View
206
Download
0
Tags:
Embed Size (px)
Citation preview
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
.
......
Filtering Clones for Individual User Based on
Machine Learning Analysis
Jiachen Yang, Keisuke Hotta, Yoshiki Higo, Hiroshi Igaki, Shinji Kusumoto
Graduate School of Information Science and Technology, Osaka University
June 4, 2012
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 1 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Motivating Example
Clonesets
Red: Un-interesting
Blue: Interesting
Participants of survey
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 2 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Motivating Example
Clonesets
Red: Un-interesting
Blue: Interesting
Participants of survey: 1 2 3 4 5 6 7 8
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 2 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Motivating Example
Clonesets
Red: Un-interesting
Blue: Interesting
Participants of survey
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 2 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Interesting U:0 vs I:8
1542 static .char *..1543 .history_substring (string , start , end)..1544 . const char *string;..1545 . int start , end;..1546 .{..1547 . register int len ;..1548 . register char *result ;..1549 . len = end − start;..1550 . result = (char *)xmalloc (len + 1);..1551 . strncpy ( result , string + start, len);..1552 . result [ len] = '\0';..1553 . return result ;..1554 .}..
(a) lib/readline/histexpand.c
126 .char *..127 .substring (string , start , end)..128 . const char *string;..129 . int start , end;..130 .{..131 . register int len ;..132 . register char *result ;..133 . len = end − start;..134 . result = (char *)xmalloc (len + 1);..135 . strncpy ( result , string + start, len);..136 . result [ len] = '\0';..137 . return ( result );..138 .}..
(b) stringlib.c
Figure: Example of source code in bash-4.2
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 3 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Un-Interesting U:8 vs I:0
191 ... __P((char *, arrayind_t, .char *));..192 .static intmax_t subexpr __P((char *));..193 .static intmax_t expcomma __P((void));..194 .static intmax_t expassign __P((void));..195 .static intmax_t expcond __P((void));..196 .static intmax_t explor __P((void));..197 .static intmax_t expland __P((void..));
(a) expr.c
309 static int run_one_command __P((.char *));..310 .static int run_wordexp __P((char *));..311 .static int uidget __P((void));..312 .static void init_interactive __P((void));..313 .static void init_noninteractive __P((void));..314 .static void init_interactive_script __P((void));..315 .static void set_shell_name __P((char..*));
(b) shell.c
Figure: Example of source code in bash-4.2
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 4 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Disagreed U:4 vs I:4
2710 static int2711 displen (s)2712 const char *s;2713 {2714 wchar_t *wcstr;2715 size_t wclen, slen ;2716 wcstr = 0.;..2717 .slen = mbstowcs (wcstr, s, 0);..2718 .if (slen == −1)..2719 . slen = 0;..2720 .wcstr = (wchar_t *)xmalloc (sizeof .....2721 .mbstowcs (wcstr, s, slen + 1);..2722 wclen = wcswidth (wcstr, slen);2723 free (wcstr);2724 return (( int)wclen);2725 }
(a) execute_cmd.c
1098 else1099 {1100 if (wcharlist == 0)1101 {1102 size_t len.;..1103 . len = mbstowcs (wcharlist, charlist , 0);..1104 . if (len == −1)..1105 . len = 0;..1106 . wcharlist = (wchar_t *)xmalloc (sizeof .....1107 . mbstowcs (wcharlist, charlist , len + 1);..1108 }1109 if (wcschr (wcharlist , wc))1110 break;1111 }
(b) subst.c
Figure: Example of source code in bash-4.2
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 5 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Fica — the name
Filter for Individual user on code Clone Analysis
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 6 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Fica — the website
Figure: Snapshot of Fica
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 7 / 14
.......... ...... ..................... ..................... ..................... ..................... ................
.......... ...... ..................... ..................... ..................... ..................... ................
.......... ...... ..................... ..................... ..................... ..................... ................
.......... ...... ..................... ..................... ..................... ..................... ................
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Compare Code Clone Similarity
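Pi = possibility to be interesting
Pu = possibility to be un-interesting

| Len | Pi    | Pi/Pu | Pu    | Comp |
|-----|-------|-------|-------|------|
| 50  | 5.56% | 1.18  | 4.72% | O    |
| 87  | 2.89% | 1.11  | 2.59% | O    |
| 79  | 1.97% | 0.69  | 2.87% | X    |
| 63  | 3.55% | 0.64  | 5.57% | O    |
| 77  | 2.66% | 0.46  | 5.83% | X    |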
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 11 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Good Experiment Result
All training: 44
All evaluation: 34
Matched: 32
Accuracy: 94.12%
Unmatched — un-interesting: 1, interesting: 1
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 12 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Bad Experiment Result
All training: 47
All evaluation: 31
Matched: 14
Accuracy: 45.16%
Unmatched — un-interesting: 16, interesting: 1
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 13 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Open Question
How to improve accuracy? By combining metrics like McCabe Cyclomatic Complexity?
Thank you!
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 14 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Unmatched: User un-interesting
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 15 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Unmatched: User interesting
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 16 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Overall Workflow
1. Submits source code
2. Detects clones
3. Marks clones as “interesting” or not
4. Records marked clones into database
5. Studies characteristics of marks using machine learning algorithms
6. Ranks unmarked clones based on machine learning
Figure: Overall Workflow of Fica with CDT
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 17 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Calc Similarity of Clones
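$$\mathrm{tf}(t, d) = \frac{|\{t : t \in d\}|}{|d|} \tag{1}$$
$$\mathrm{idf}(t, D) = \log \frac{|D|}{1 + |\{d \in D : t \in d\}|} \tag{2}$$
$$\mathrm{tfidf}(t, d, D) = \mathrm{tf}(t, d) \times \mathrm{idf}(t, D) \tag{3}$$
$$\overrightarrow{\mathrm{tfidf}}(d, D) = [\,\mathrm{tfidf}(t, d, D) \;\; \forall t \in d\,] \tag{4}$$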
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 18 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Predicting Category
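$$\mathrm{sim}(a, b, D) = \overrightarrow{\mathrm{tfidf}}(a, D) \cdot \overrightarrow{\mathrm{tfidf}}(b, D) \tag{5}$$
$$\mathrm{nsim}(a, b, D) = \begin{cases} 0, & \mathrm{sim}(a, b, D) = 0 \\ \dfrac{\mathrm{sim}(a, b, D)}{|\mathrm{sim}(a, b, D)|}, & \text{otherwise} \end{cases} \tag{6}$$
$$\mathrm{poss}(t, M) = \begin{cases} 1, & |M| = 0 \\ \dfrac{\sum_{\forall m \in M} \mathrm{nsim}(t, m, M)}{|M|}, & \text{otherwise} \end{cases} \tag{7}$$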
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 19 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Result — bash
Legend: A B C D E F G H
X axis — Percentage of Training Set (%): 10 20 30 40 50 60 70 80 90 100
Y axis — Accuracy (%): 0 25 50 75 100
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 20 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Result — git
Legend: A B C D E F G H
X axis — Percentage of Training Set (%): 10 20 30 40 50 60 70 80 90 100
Y axis — Accuracy (%): 0 25 50 75 100
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 21 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Result — xz
Legend: A B C D E F G H
X axis — Percentage of Training Set (%): 10 20 30 40 50 60 70 80 90 100
Y axis — Accuracy (%): 0 25 50 75 100
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 22 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Result — e2fsprogs
Legend: A B C D E F G H
X axis — Percentage of Training Set (%): 10 20 30 40 50 60 70 80 90 100
Y axis — Accuracy (%): 0 25 50 75 100
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 23 / 14
..........
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
.....
.....
......
.....
..........
......
.....
.....
.
Result — All Projects
Legend: A B C D E F G H
X axis — Percentage of Training Set (%): 10 20 30 40 50 60 70 80 90 100
Y axis — Accuracy (%): 0 25 50 75 100
Jiachen Yang (IST, Osaka-U) Fica@IWSC2012 June 4, 2012 24 / 14