-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetFullcredits.awk
90 lines (78 loc) · 1.6 KB
/
getFullcredits.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Get credits data from the "Full Cast & Crew" page
#
# Used to debug possibly missing data from the .tsv.gz files
#
# Grab title, grab people by category, add characters for actors
# Process director, writer, actor, producer
# Exit when the composer section is encountered, which skips reading the remaining ~50% of the file.
# pageId meta tag: keep its second double-quoted value as tconstID.
/<meta property="pageId"/ {
    # Splitting on double quotes leaves the quoted values in the even
    # positions; position 4 is the second one on the line.
    split($0, quoted, "\"")
    tconstID = quoted[4]
    next
}
# title meta tag: keep its second double-quoted value as showTitle, minus
# everything from the first " (" onward.
/<meta name="title" content=/ {
    split($0, quoted, "\"")
    showTitle = quoted[4]
    # Cut the trailing parenthesised part and whatever follows it
    sub(/ \(.*/, "", showTitle)
    next
}
# Start a new credits section: remember its category label and restart the
# per-section rank counter.
function beginSection(label) {
    category = label
    rank = 0
}

# Section markers. Each one switches the current category; the record itself
# carries no credit, so processing moves straight on to the next line.
/name="director" id="director"/ { beginSection("director"); next }
/name="writer" id="writer"/     { beginSection("writer");   next }
/name="cast" id="cast"/         { beginSection("actor");    next }
/name="producer" id="producer"/ { beginSection("producer"); next }
# Person link: record the person's ID and name, and print a row right away
# for every category except "actor". Rows in the "actor" category are left
# to the rule that matches character links, which also supplies the
# character column.
/^<a href="\/name\// {
    # Splitting the link on "/" puts the ID in the third field
    split($0, fld, "/")
    nconstID = fld[3]
    # The name is read from the following line. If the input ends here,
    # getline leaves $0 unchanged, so stop instead of printing link markup
    # as a name.
    if ((getline) <= 0) next
    # The following line is an image tag rather than a name: nothing to record
    if ($0 ~ /><img height/) next
    sub(/> /, "")
    name = $0
    # Suppress consecutive repeats of the same person only within a single
    # category. Someone who closes one section and opens the next (e.g. a
    # writer-director) must still get a row in each category.
    if (category != "actor" && (name != previousName || category != previousCategory)) {
        rank += 1
        printf("%s\t%s\t\t%02d\t%s\t\t%s\t%s\n", name, showTitle, rank, category, nconstID, tconstID)
    }
    previousName = name
    previousCategory = category
    next
}
# Character link: print one row for the most recently recorded person,
# including the character taken from this line.
/<a href="\/title\/tt.*\/characters\/nm/ {
    # Split on angle brackets; the third piece is used as the character
    split($0, piece, "[<>]")
    character = piece[3]
    rank++
    printf("%s\t%s\t\t%02d\t%s\t%s\t%s\t%s\n", name, showTitle, rank, category, character, nconstID, tconstID)
    next
}
# Composer section marker: stop reading the input here.
/name="composer" id="composer"/ {
    exit
}