Date: Tue, 1 Oct 2019 06:42:41 +1000
I'm releasing a second artifact from ACTCD19 today, called "PERFILE", which
estimates the per-source-file prevalence of C/C++ tokens.
For each of the 47,705,063 unique token spellings, we count how many of
the 2,489,599 source files contain at least one occurrence of the token. We
then divide by the total number of source files to determine the percentage
of source files that contain at least one occurrence of the token.
For example, 41% of C/C++ source files contain one or more occurrences of
the keyword `else`. Therefore, it follows that 59% of C/C++ source files
do not contain the keyword `else`.
Tokens with greater than 0.01% prevalence (i.e. occurring in more than 1 in
every 10,000 source files) are listed in ACTCD19-PERFILE.txt, which is
attached and also available via the following link:
https://raw.githubusercontent.com/tomazos/actcd19/master/ACTCD19-PERFILE.txt
For a random sample of occurrences of a given token, you can use
codesearch.isocpp.org.
The TOP 300 are pasted below. (TOKLEN is just the byte length of the token
for machine-reading)
PERCENT TOKLEN TOKEN
97.1256 1 #
95.8277 1 ;
95.4258 1 (
95.4252 1 )
90.9156 1 }
90.9142 1 {
90.535 7 include
87.8798 1 ,
80.5935 1 *
72.3236 4 void
72.0929 1 =
65.7425 5 const
64.5512 1 &
62.1809 1 0
61.3398 6 return
60.7381 3 int
58.7883 1 :
58.3704 5 endif
57.1048 2 if
55.8634 6 define
55.3522 1 <
52.0347 1 1
48.8295 1 >
46.1672 1 .
45.3091 1 [
45.3088 1 ]
43.2325 2 ->
43.1835 2 ==
43.0489 2 ::
41.2506 6 ifndef
40.6642 4 else
40.2556 4 char
37.8444 1 !
37.62 1 -
36.7728 6 static
35.4269 2 !=
34.1355 1 +
33.7223 1 2
33.4819 4 bool
33.4318 2 ++
32.695 2 &&
32.1592 6 struct
31.436 3 for
29.8759 9 namespace
29.2479 1 ~
28.0267 5 class
26.7104 2 ||
26.5191 1 i
26.1835 4 NULL
25.5951 6 public
23.7209 1 3
23.0291 8 unsigned
21.6406 3 std
21.5988 1 4
21.2915 5 false
20.7938 4 true
20.7838 5 break
20.6971 1 ?
20.4055 6 sizeof
20.2566 5 ifdef
20.0997 5 while
18.9025 2 >=
18.8148 2 +=
18.6324 2 <<
18.3964 4 size
18.3067 7 typedef
17.2047 1 /
17.0467 7 private
16.9848 4 name
16.0492 6 size_t
15.9981 6 switch
15.9651 4 case
15.7386 4 data
15.2661 1 8
14.9547 1 5
14.812 2 <=
14.411 4 type
14.3009 1 |
14.2562 7 default
13.5917 3 new
13.423 1 x
13.1595 4 this
12.8055 6 string
12.638 6 double
12.6224 5 value
12.4951 2 10
12.1794 4 enum
12.0366 1 6
11.4818 4 long
11.2566 6 extern
11.0233 2 16
10.9757 2 --
10.9749 1 p
10.6333 7 virtual
10.4838 5 using
10.4255 1 s
10.1317 1 n
9.93525 1 7
9.76454 2 >>
9.61581 7 defined
9.60613 3 end
9.5623 2 ""
9.48382 8 continue
9.28961 1 c
9.23719 9 <stdio.h>
9.15662 3 len
9.04013 10 <string.h>
8.66356 9 protected
8.65312 1 a
8.6105 10 <stdlib.h>
8.51165 1 b
8.46984 1 y
8.41521 6 vector
8.33202 6 parent
8.23494 6 inline
8.03579 4 main
8.00691 5 flags
7.9359 2 -=
7.87729 5 count
7.86078 6 result
7.72614 2 32
7.72546 5 error
7.70867 1 j
7.63806 3 buf
7.54322 2 id
7.53539 8 operator
7.49691 2 |=
7.47205 8 uint32_t
7.30431 5 float
7.21538 7 nullptr
7.16842 8 template
7.086 6 length
7.08568 4 goto
6.99334 3 ret
6.93405 1 9
6.85713 3 str
6.856 6 delete
6.79929 4 argv
6.71654 11 static_cast
6.69779 2 12
6.66625 6 strlen
6.62721 5 index
6.59375 6 memset
6.59178 4 argc
6.56873 4 free
6.56158 1 r
6.50229 1 d
6.42083 7 QString
6.37432 1 %
6.36934 3 100
6.25097 6 memcpy
6.22253 1 t
6.2055 5 begin
6.17521 2 20
6.06282 4 next
6.03021 6 offset
5.96787 2 do
5.95124 1 f
5.90842 3 get
5.65135 4 '\0'
5.60677 3 key
5.58134 5 clear
5.57773 1 v
5.56596 6 strcmp
5.56439 6 assert
5.54917 3 val
5.5386 4 base
5.53583 5 start
5.4118 7 uint8_t
5.40151 8 override
5.39677 5 state
5.35496 5 FALSE
5.33222 6 printf
5.31813 5 width
5.29909 4 TRUE
5.26703 2 11
5.25711 2 15
5.19919 3 256
5.15902 2 64
5.09668 9 push_back
5.08447 7 fprintf
5.07214 3 0.0
5.05989 6 buffer
5.05306 4 "\n"
4.97602 3 out
4.93754 1 e
4.88545 5 undef
4.80965 6 status
4.78736 6 pragma
4.78451 4 path
4.77394 8 typename
4.77209 3 pos
4.7502 2 24
4.72839 1 m
4.72827 4 auto
4.72811 3 1.0
4.72466 4 FILE
4.71972 10 "config.h"
4.69943 5 empty
4.68814 4 info
4.58881 6 height
4.42758 2 &=
4.38769 4 list
4.36853 3 arg
4.36709 5 first
4.3591 1 k
4.34889 6 stderr
4.33375 10 <unistd.h>
4.30832 8 explicit
4.29555 1 w
4.28651 2 13
4.26294 4 1024
4.23803 3 max
4.23213 4 file
4.19995 4 mode
4.19915 2 14
4.1875 3 err
4.18156 4 1000
4.13589 1 h
4.10701 3 ...
4.10187 3 tmp
4.07768 3 ptr
4.07431 3 128
4.05013 8 filename
4.02326 6 malloc
3.9814 4 exit
3.97719 3 "C"
3.96775 1 l
3.914 5 close
3.89436 8 <string>
3.83018 3 map
3.77539 3 set
3.77113 4 text
3.75599 4 init
3.75502 5 c_str
3.70618 2 30
3.69811 3 res
3.68075 2 *=
3.67039 1 T
3.66083 3 src
3.65501 5 short
3.61046 8 uint64_t
3.58757 3 msg
3.52503 6 format
3.50109 5 event
3.4957 4 args
3.47401 9 <errno.h>
3.42481 8 <vector>
3.39625 5 errno
3.3942 3 min
3.38842 8 Q_OBJECT
3.37584 5 reset
3.37524 3 ' '
3.37167 7 QObject
3.36693 4 endl
3.32262 7 context
3.31921 4 line
3.30507 6 insert
3.29804 7 QWidget
3.28334 6 second
3.23301 10 unique_ptr
3.21899 4 find
3.21831 3 obj
3.19706 8 iterator
3.19349 4 addr
3.15689 4 read
3.14999 2 fd
3.14251 7 message
3.13946 2 it
3.11347 13 <sys/types.h>
3.08375 2 50
3.03804 2 17
3.03599 4 0xff
3.028 8 gboolean
3.01382 4 '\n'
2.99864 8 uint16_t
2.99655 6 append
2.9945 4 once
2.97803 4 open
2.9653 2 18
2.96453 7 isEmpty
2.9559 5 write
2.95546 7 sprintf
2.93702 4 node
2.82001 3 255
2.81507 13 HAVE_CONFIG_H
2.81499 5 boost
2.81065 3 " "
2.80917 4 time
2.79439 4 0x01
2.78181 7 connect
2.77037 10 <config.h>
2.75382 7 int32_t
2.75201 11 __cplusplus
2.74964 6 String
2.73896 1 _
2.73574 2 31
2.73305 4 0x80
2.71341 7 version
2.67485 5 throw
Enjoy,
Andrew.
estimates the per-source-file prevalence of C/C++ tokens.
For each of the 47,705,063 unique token spellings, we count how many of
the 2,489,599 source files contain at least one occurrence of the token. We
then divide by the total number of source files to determine the percentage
of source files that contain at least one occurrence of the token.
For example, 41% of C/C++ source files contain one or more occurrences of
the keyword `else`. Therefore, it follows that 59% of C/C++ source files
do not contain the keyword `else`.
Tokens with greater than 0.01% prevalence (i.e. occurring in more than 1 in
every 10,000 source files) are listed in ACTCD19-PERFILE.txt, which is
attached and also available via the following link:
https://raw.githubusercontent.com/tomazos/actcd19/master/ACTCD19-PERFILE.txt
For a random sample of occurrences of a given token, you can use
codesearch.isocpp.org.
The TOP 300 are pasted below. (TOKLEN is just the byte length of the token
for machine-reading)
PERCENT TOKLEN TOKEN
97.1256 1 #
95.8277 1 ;
95.4258 1 (
95.4252 1 )
90.9156 1 }
90.9142 1 {
90.535 7 include
87.8798 1 ,
80.5935 1 *
72.3236 4 void
72.0929 1 =
65.7425 5 const
64.5512 1 &
62.1809 1 0
61.3398 6 return
60.7381 3 int
58.7883 1 :
58.3704 5 endif
57.1048 2 if
55.8634 6 define
55.3522 1 <
52.0347 1 1
48.8295 1 >
46.1672 1 .
45.3091 1 [
45.3088 1 ]
43.2325 2 ->
43.1835 2 ==
43.0489 2 ::
41.2506 6 ifndef
40.6642 4 else
40.2556 4 char
37.8444 1 !
37.62 1 -
36.7728 6 static
35.4269 2 !=
34.1355 1 +
33.7223 1 2
33.4819 4 bool
33.4318 2 ++
32.695 2 &&
32.1592 6 struct
31.436 3 for
29.8759 9 namespace
29.2479 1 ~
28.0267 5 class
26.7104 2 ||
26.5191 1 i
26.1835 4 NULL
25.5951 6 public
23.7209 1 3
23.0291 8 unsigned
21.6406 3 std
21.5988 1 4
21.2915 5 false
20.7938 4 true
20.7838 5 break
20.6971 1 ?
20.4055 6 sizeof
20.2566 5 ifdef
20.0997 5 while
18.9025 2 >=
18.8148 2 +=
18.6324 2 <<
18.3964 4 size
18.3067 7 typedef
17.2047 1 /
17.0467 7 private
16.9848 4 name
16.0492 6 size_t
15.9981 6 switch
15.9651 4 case
15.7386 4 data
15.2661 1 8
14.9547 1 5
14.812 2 <=
14.411 4 type
14.3009 1 |
14.2562 7 default
13.5917 3 new
13.423 1 x
13.1595 4 this
12.8055 6 string
12.638 6 double
12.6224 5 value
12.4951 2 10
12.1794 4 enum
12.0366 1 6
11.4818 4 long
11.2566 6 extern
11.0233 2 16
10.9757 2 --
10.9749 1 p
10.6333 7 virtual
10.4838 5 using
10.4255 1 s
10.1317 1 n
9.93525 1 7
9.76454 2 >>
9.61581 7 defined
9.60613 3 end
9.5623 2 ""
9.48382 8 continue
9.28961 1 c
9.23719 9 <stdio.h>
9.15662 3 len
9.04013 10 <string.h>
8.66356 9 protected
8.65312 1 a
8.6105 10 <stdlib.h>
8.51165 1 b
8.46984 1 y
8.41521 6 vector
8.33202 6 parent
8.23494 6 inline
8.03579 4 main
8.00691 5 flags
7.9359 2 -=
7.87729 5 count
7.86078 6 result
7.72614 2 32
7.72546 5 error
7.70867 1 j
7.63806 3 buf
7.54322 2 id
7.53539 8 operator
7.49691 2 |=
7.47205 8 uint32_t
7.30431 5 float
7.21538 7 nullptr
7.16842 8 template
7.086 6 length
7.08568 4 goto
6.99334 3 ret
6.93405 1 9
6.85713 3 str
6.856 6 delete
6.79929 4 argv
6.71654 11 static_cast
6.69779 2 12
6.66625 6 strlen
6.62721 5 index
6.59375 6 memset
6.59178 4 argc
6.56873 4 free
6.56158 1 r
6.50229 1 d
6.42083 7 QString
6.37432 1 %
6.36934 3 100
6.25097 6 memcpy
6.22253 1 t
6.2055 5 begin
6.17521 2 20
6.06282 4 next
6.03021 6 offset
5.96787 2 do
5.95124 1 f
5.90842 3 get
5.65135 4 '\0'
5.60677 3 key
5.58134 5 clear
5.57773 1 v
5.56596 6 strcmp
5.56439 6 assert
5.54917 3 val
5.5386 4 base
5.53583 5 start
5.4118 7 uint8_t
5.40151 8 override
5.39677 5 state
5.35496 5 FALSE
5.33222 6 printf
5.31813 5 width
5.29909 4 TRUE
5.26703 2 11
5.25711 2 15
5.19919 3 256
5.15902 2 64
5.09668 9 push_back
5.08447 7 fprintf
5.07214 3 0.0
5.05989 6 buffer
5.05306 4 "\n"
4.97602 3 out
4.93754 1 e
4.88545 5 undef
4.80965 6 status
4.78736 6 pragma
4.78451 4 path
4.77394 8 typename
4.77209 3 pos
4.7502 2 24
4.72839 1 m
4.72827 4 auto
4.72811 3 1.0
4.72466 4 FILE
4.71972 10 "config.h"
4.69943 5 empty
4.68814 4 info
4.58881 6 height
4.42758 2 &=
4.38769 4 list
4.36853 3 arg
4.36709 5 first
4.3591 1 k
4.34889 6 stderr
4.33375 10 <unistd.h>
4.30832 8 explicit
4.29555 1 w
4.28651 2 13
4.26294 4 1024
4.23803 3 max
4.23213 4 file
4.19995 4 mode
4.19915 2 14
4.1875 3 err
4.18156 4 1000
4.13589 1 h
4.10701 3 ...
4.10187 3 tmp
4.07768 3 ptr
4.07431 3 128
4.05013 8 filename
4.02326 6 malloc
3.9814 4 exit
3.97719 3 "C"
3.96775 1 l
3.914 5 close
3.89436 8 <string>
3.83018 3 map
3.77539 3 set
3.77113 4 text
3.75599 4 init
3.75502 5 c_str
3.70618 2 30
3.69811 3 res
3.68075 2 *=
3.67039 1 T
3.66083 3 src
3.65501 5 short
3.61046 8 uint64_t
3.58757 3 msg
3.52503 6 format
3.50109 5 event
3.4957 4 args
3.47401 9 <errno.h>
3.42481 8 <vector>
3.39625 5 errno
3.3942 3 min
3.38842 8 Q_OBJECT
3.37584 5 reset
3.37524 3 ' '
3.37167 7 QObject
3.36693 4 endl
3.32262 7 context
3.31921 4 line
3.30507 6 insert
3.29804 7 QWidget
3.28334 6 second
3.23301 10 unique_ptr
3.21899 4 find
3.21831 3 obj
3.19706 8 iterator
3.19349 4 addr
3.15689 4 read
3.14999 2 fd
3.14251 7 message
3.13946 2 it
3.11347 13 <sys/types.h>
3.08375 2 50
3.03804 2 17
3.03599 4 0xff
3.028 8 gboolean
3.01382 4 '\n'
2.99864 8 uint16_t
2.99655 6 append
2.9945 4 once
2.97803 4 open
2.9653 2 18
2.96453 7 isEmpty
2.9559 5 write
2.95546 7 sprintf
2.93702 4 node
2.82001 3 255
2.81507 13 HAVE_CONFIG_H
2.81499 5 boost
2.81065 3 " "
2.80917 4 time
2.79439 4 0x01
2.78181 7 connect
2.77037 10 <config.h>
2.75382 7 int32_t
2.75201 11 __cplusplus
2.74964 6 String
2.73896 1 _
2.73574 2 31
2.73305 4 0x80
2.71341 7 version
2.67485 5 throw
Enjoy,
Andrew.
Received on 2019-09-30 15:45:09