Skip to main content

chatsounds/
fetching.rs

1use std::path::{Component, Path};
2
3use serde::Deserialize;
4
5use crate::{
6    Chatsounds, Error, cache::download, error::Result, parsing::normalize_sentence,
7    types::Chatsound,
8};
9
10#[derive(Deserialize)]
11pub struct GitHubApiFileEntry {
12    pub path: String,
13    pub r#type: String,
14    pub size: Option<usize>,
15}
16
17#[derive(Deserialize)]
18pub struct GitHubApiTrees {
19    pub tree: Vec<GitHubApiFileEntry>,
20}
21
22pub type GitHubMsgpackEntries = Vec<Vec<String>>;
23
24/// Extract `(sentence_key, sound_path)` from a GitHub-API tree entry path,
25/// returning `None` for entries that should not become a `map_store` key:
26/// outside `repo_path`, top-level files (no category dir), dotfiles like
27/// `.gitkeep`, or names that normalize to an empty key.
28///
29/// Mirrors the existing `<repo_path>/<category>/<file>` schema — the
30/// second component of the relative path is what becomes the key. For
31/// nested files (`<cat>/<subcat>/<file>`) the subcat name is the key,
32/// which is how multiple takes of the same sound share a lookup key.
33fn parse_api_entry(repo_path: &str, entry_path: &str) -> Option<(String, String)> {
34    let suffix = entry_path
35        .strip_prefix(repo_path)
36        .and_then(|s| s.strip_prefix('/'))?;
37    let path = Path::new(suffix);
38    let Component::Normal(filename) = path.components().nth(1)? else {
39        return None;
40    };
41    if filename.to_string_lossy().starts_with('.') {
42        return None;
43    }
44    let stem = Path::new(filename).file_stem()?;
45    let key = normalize_sentence(&stem.to_string_lossy());
46    if key.is_empty() {
47        return None;
48    }
49    Some((key, suffix.to_string()))
50}
51
52impl Chatsounds {
53    pub async fn fetch_github_api(&self, repo: &str, _repo_path: &str) -> Result<GitHubApiTrees> {
54        let api_url = format!("https://api.github.com/repos/{repo}/git/trees/HEAD?recursive=1");
55
56        #[cfg(feature = "fs")]
57        let cache = &self.cache_path;
58        #[cfg(feature = "memory")]
59        let cache = self.fs_memory.clone();
60
61        tracing::debug!(repo, "fetching GitHub tree API");
62        let bytes = download(&api_url, cache, false).await?;
63
64        let trees: GitHubApiTrees =
65            serde_json::from_slice(&bytes).map_err(|err| Error::Json { err, url: api_url })?;
66
67        tracing::debug!(repo, entries = trees.tree.len(), "fetched GitHub tree API");
68        Ok(trees)
69    }
70
71    pub fn load_github_api(
72        &mut self,
73        repo: &str,
74        repo_path: &str,
75        trees: GitHubApiTrees,
76    ) -> Result<()> {
77        let mut added = 0usize;
78        for entry in trees.tree {
79            if entry.r#type != "blob" {
80                continue;
81            }
82            let Some((sentence, sound_path)) = parse_api_entry(repo_path, &entry.path) else {
83                continue;
84            };
85
86            let vec = self.map_store.entry(sentence).or_default();
87            let chatsound = Chatsound {
88                repo: repo.to_string(),
89                repo_path: repo_path.to_string(),
90                sound_path,
91            };
92
93            let url = chatsound.get_url();
94            match vec.binary_search_by(|c| c.get_url().cmp(&url)) {
95                Ok(_pos) => {
96                    // already exists, don't add again
97                }
98                Err(pos) => {
99                    vec.insert(pos, chatsound);
100                    added += 1;
101                }
102            }
103        }
104
105        tracing::debug!(
106            repo,
107            repo_path,
108            added,
109            total_keys = self.map_store.len(),
110            "loaded chatsounds from GitHub tree API"
111        );
112        Ok(())
113    }
114
115    pub async fn fetch_github_msgpack(
116        &self,
117        repo: &str,
118        repo_path: &str,
119    ) -> Result<GitHubMsgpackEntries> {
120        let msgpack_url =
121            format!("https://raw.githubusercontent.com/{repo}/HEAD/{repo_path}/list.msgpack");
122
123        #[cfg(feature = "fs")]
124        let cache = &self.cache_path;
125        #[cfg(feature = "memory")]
126        let cache = self.fs_memory.clone();
127
128        tracing::debug!(repo, repo_path, "fetching list.msgpack");
129        let bytes = download(&msgpack_url, cache, false).await?;
130        let entries: GitHubMsgpackEntries =
131            rmp_serde::decode::from_slice(&bytes).map_err(|err| Error::Msgpack {
132                err,
133                url: msgpack_url,
134            })?;
135
136        tracing::debug!(
137            repo,
138            repo_path,
139            entries = entries.len(),
140            "fetched list.msgpack"
141        );
142        Ok(entries)
143    }
144
145    pub fn load_github_msgpack(
146        &mut self,
147        repo: &str,
148        repo_path: &str,
149        entries: GitHubMsgpackEntries,
150    ) -> Result<()> {
151        let mut added = 0usize;
152        for entry in entries {
153            // e26/stop.ogg or e26/nestetrismusic/1.ogg
154            let sentence = normalize_sentence(&entry[1]);
155            if sentence.is_empty() {
156                continue;
157            }
158            let sound_path = entry[2].clone();
159            let vec = self.map_store.entry(sentence).or_default();
160
161            let chatsound = Chatsound {
162                repo: repo.to_string(),
163                repo_path: repo_path.to_string(),
164                sound_path,
165            };
166
167            let url = chatsound.get_url();
168            match vec.binary_search_by(|c| c.get_url().cmp(&url)) {
169                Ok(_pos) => {
170                    // already exists, don't add again
171                }
172                Err(pos) => {
173                    vec.insert(pos, chatsound);
174                    added += 1;
175                }
176            }
177        }
178
179        tracing::debug!(
180            repo,
181            repo_path,
182            added,
183            total_keys = self.map_store.len(),
184            "loaded chatsounds from list.msgpack"
185        );
186        Ok(())
187    }
188}
189
#[cfg(test)]
mod tests {
    use super::parse_api_entry;

    /// `repo_path` shared by every case in this module.
    const REPO: &str = "sounds";

    /// Build the value a successful parse is expected to return.
    fn hit(key: &str, sound_path: &str) -> Option<(String, String)> {
        Some((key.to_string(), sound_path.to_string()))
    }

    /// Assert that each `(entry_path, key, sound_path)` row parses to that pair.
    fn assert_parses(cases: &[(&str, &str, &str)]) {
        for &(entry_path, key, sound_path) in cases {
            assert_eq!(
                parse_api_entry(REPO, entry_path),
                hit(key, sound_path),
                "entry path: {entry_path:?}"
            );
        }
    }

    /// Assert that every given entry path is rejected.
    fn assert_skipped(entry_paths: &[&str]) {
        for &entry_path in entry_paths {
            assert_eq!(
                parse_api_entry(REPO, entry_path),
                None,
                "entry path: {entry_path:?}"
            );
        }
    }

    #[test]
    fn weird_filenames_from_user() {
        assert_parses(&[
            (
                "sounds/sammich/yes no yes no YES NO.ogg",
                "yes no yes no YES NO",
                "sammich/yes no yes no YES NO.ogg",
            ),
            (
                "sounds/bill_wurtz/hell yeah now we've got business.ogg",
                "hell yeah now weve got business",
                "bill_wurtz/hell yeah now we've got business.ogg",
            ),
            ("sounds/piano ogg/0-a.ogg", "0 a", "piano ogg/0-a.ogg"),
            (
                "sounds/test/ file test  .ogg",
                "file test",
                "test/ file test  .ogg",
            ),
            (
                "sounds/test/!file_test!.ogg",
                "file test",
                "test/!file_test!.ogg",
            ),
        ]);
    }

    #[test]
    fn dotfiles_are_skipped() {
        assert_skipped(&[
            "sounds/sammich/.gitkeep",
            "sounds/cat/.DS_Store",
            "sounds/cat/.gitignore",
            // A `.hidden.ogg` still counts as a dotfile under Unix conventions.
            "sounds/cat/.hidden.ogg",
        ]);
    }

    #[test]
    fn top_level_files_are_skipped() {
        // The schema is `<repo_path>/<category>/<file>`, so anything sitting
        // directly under repo_path (no category dir) is dropped.
        assert_skipped(&["sounds/README.md", "sounds/file.ogg"]);
    }

    #[test]
    fn outside_repo_path_is_skipped() {
        assert_skipped(&[
            "other/cat/file.ogg",
            // Sharing a prefix is not enough; a `/` boundary must follow it.
            "soundsthing/cat/file.ogg",
            "sounds",
            "",
        ]);
    }

    #[test]
    fn empty_after_normalization_is_skipped() {
        assert_skipped(&[
            "sounds/cat/!!!.ogg",
            "sounds/cat/'''.ogg",
            "sounds/cat/   .ogg",
        ]);
    }

    #[test]
    fn subdir_takes_subdir_name_as_key() {
        // Existing behavior: `<cat>/<subcat>/<file>` is keyed by subcat, so
        // `1.ogg`, `2.ogg`... under `e26/nestetrismusic/` all share the key
        // `nestetrismusic`.
        assert_parses(&[(
            "sounds/e26/nestetrismusic/1.ogg",
            "nestetrismusic",
            "e26/nestetrismusic/1.ogg",
        )]);
    }

    #[test]
    fn category_with_punctuation_still_works() {
        // The category dir name only ends up in `sound_path`; the key is
        // derived from the file leaf alone.
        assert_parses(&[(
            "sounds/!weird cat!/hello.ogg",
            "hello",
            "!weird cat!/hello.ogg",
        )]);
    }

    #[test]
    fn file_with_no_extension_keeps_whole_name() {
        assert_parses(&[("sounds/cat/foo", "foo", "cat/foo")]);
    }

    #[test]
    fn commas_in_filename_drop_without_space() {
        // A comma inside a filename is removed without leaving a gap — the
        // same as an apostrophe — so `1,000 meme.ogg` keys to `1000 meme`
        // while the real space between the words survives.
        assert_parses(&[
            ("sounds/cat/1,000 meme.ogg", "1000 meme", "cat/1,000 meme.ogg"),
            ("sounds/cat/a,b,c.ogg", "abc", "cat/a,b,c.ogg"),
            (
                "sounds/cat/yes,no,maybe.ogg",
                "yesnomaybe",
                "cat/yes,no,maybe.ogg",
            ),
            // Comma and apostrophe together in one filename.
            (
                "sounds/cat/we've got 1,000.ogg",
                "weve got 1000",
                "cat/we've got 1,000.ogg",
            ),
        ]);
        // A filename made only of commas normalizes to nothing and is skipped.
        assert_skipped(&["sounds/cat/,,,.ogg"]);
    }
}