-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_type_utils.go
70 lines (64 loc) · 1.86 KB
/
file_type_utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
package markitdown
import (
"fmt"
"net/http"
"path/filepath"
"strings"
)
// getFileType determines the file type of a fetched resource so that it can be
// mapped to a converter.
//
// The Content-Type header of resp is consulted first: any parameters (charset,
// boundary, ...) are dropped and the remaining media type is matched
// case-insensitively against the table below. If the header is absent or names
// a media type that is not in the table, the extension of url is used instead,
// after removing any query string or fragment.
//
// On success it returns a short type name such as "html" or "pdf". It returns
// an error when resp is nil, or when neither the header nor the URL extension
// identifies a supported type.
func getFileType(resp *http.Response, url string) (string, error) {
	if resp == nil {
		return "", fmt.Errorf("http response is nil")
	}

	// First try the Content-Type header.
	contentType := resp.Header.Get("Content-Type")
	if contentType != "" {
		// Strip any charset or boundary parameters.
		if idx := strings.Index(contentType, ";"); idx != -1 {
			contentType = contentType[:idx]
		}
		// Media types are case-insensitive, so normalise before matching;
		// otherwise a header like "Text/HTML" would be missed.
		contentType = strings.ToLower(strings.TrimSpace(contentType))

		// Map MIME types to file types.
		switch contentType {
		case "text/html", "application/xhtml+xml":
			return "html", nil
		case "application/pdf":
			return "pdf", nil
		case "application/epub+zip":
			return "epub", nil
		case "application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
			return "doc", nil
		case "text/markdown":
			return "md", nil
		case "text/plain":
			return "txt", nil
		}
	}

	// Fall back to the URL's extension. The query string and fragment are cut
	// off first so that "report.pdf?dl=1" is seen as ".pdf" rather than
	// ".pdf?dl=1".
	urlPath := url
	if idx := strings.IndexAny(urlPath, "?#"); idx != -1 {
		urlPath = urlPath[:idx]
	}
	ext, err := getFileTypeFromPath(urlPath)
	if err != nil {
		return "", fmt.Errorf("failed to determine file type from URL: %w", err)
	}
	return ext, nil
}
// getFileTypeFromPath maps the extension of filePath to a short file type
// name. The extension is compared case-insensitively, so "A.PDF" and "a.pdf"
// both yield "pdf". An error is returned when the extension is missing or is
// not one of the recognised ones.
func getFileTypeFromPath(filePath string) (string, error) {
	suffix := strings.ToLower(filepath.Ext(filePath))

	var kind string
	switch suffix {
	case ".htm", ".html":
		kind = "html"
	case ".pdf":
		kind = "pdf"
	case ".epub":
		kind = "epub"
	case ".docx", ".doc":
		kind = "doc"
	case ".markdown", ".md":
		kind = "md"
	case ".txt":
		kind = "txt"
	default:
		// Anything else, including no extension at all, is rejected.
		return "", fmt.Errorf("unsupported or unknown file type: %s", suffix)
	}
	return kind, nil
}