Skip to content

Commit e3aaf3a

Browse files
committed
code
1 parent 104f9c1 commit e3aaf3a

File tree

5 files changed

+33
-28
lines changed

5 files changed

+33
-28
lines changed

Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ STACK=stack
44
WGET=wget
55
WIKILANG=de
66

7-
docs/known.$(WIKILANG).txt docs/unknown.$(WIKILANG).txt) docs/subseq.$(WIKILANG).txt: extracted.$(WIKILANG)
7+
docs/$(WIKILANG)/known.txt docs/$(WIKILANG)/known-subseq.txt: extracted.$(WIKILANG)
88
$(STACK) build
9-
$(STACK) exec wikiwc -- -w words.$(WIKILANG) -k docs/$(WIKILANG)/known.txt -u docs/$(WIKILANG)/unknown.txt -s docs/$(WIKILANG)/known-subseq.txt extracted.$(WIKILANG)/*/wiki_* +RTS -N
9+
$(STACK) exec wikiwc -- -w words.$(WIKILANG) -k docs/$(WIKILANG)/known.txt -s docs/$(WIKILANG)/known-subseq.txt extracted.$(WIKILANG)/*/wiki_* +RTS -N
1010

1111
clean:
1212
rm wikiwc.$(WIKILANG) $(WIKILANG)wiki-latest-pages-articles.xml.bz2

app/Main.hs

+10-13
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
1-
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
1+
{-# LANGUAGE BangPatterns #-}
22
{-# LANGUAGE OverloadedStrings #-}
33
{-# LANGUAGE RecordWildCards #-}
44
module Main where
55

6-
import Control.Concurrent.Async (mapConcurrently)
7-
import Control.Monad (foldM)
8-
import Data.Foldable (foldl')
9-
import Data.Text (Text)
6+
import Control.Concurrent.Async (mapConcurrently)
7+
import Control.Monad (foldM)
8+
import Data.Foldable (foldl')
9+
import Data.Text (Text)
1010
import qualified Data.Text as Text
1111
import qualified Data.Text.IO as Text
12-
import System.FilePath.Glob (glob)
13-
import System.Environment (getArgs)
14-
import WordFreq
15-
import Options.Applicative
12+
import Options.Applicative
13+
import WordFreq
1614

1715
data Options = Options {
1816
knownWordListFiles :: [FilePath]
@@ -32,8 +30,8 @@ programOptions =
3230

3331
wikiwc :: Options -> IO ()
3432
wikiwc Options{..} = do
35-
words <- foldl' mappend mempty <$> mapConcurrently loadWordList knownWordListFiles
36-
wc <- normalize <$> foldM loadWikiDump mempty wikiExtractorFiles
33+
!words <- foldl' mappend mempty <$> mapConcurrently loadWordList knownWordListFiles
34+
!wc <- normalize <$> foldM loadWikiDump mempty wikiExtractorFiles
3735
let known = filterWordFreq (knownWord words) (const True) wc
3836
let unknown = filterWordFreq (not . knownWord words) (const True) wc
3937
case knownOutput of
@@ -55,5 +53,4 @@ main = wikiwc =<< execParser opts where
5553
<> header "wikiwc - Wikipedia word counter"
5654

5755
toText :: [(Int, Text)] -> Text
58-
toText = foldMap f where
59-
f (a, w) = Text.pack (show a) <> " " <> w <> "\n"
56+
toText = foldMap $ \(a, w) -> Text.pack (show a) <> " " <> w <> "\n"

package.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ dependencies:
2020
- base >= 4.7 && < 5
2121
- containers
2222
- extra
23-
- Glob
2423
- optparse-applicative
2524
- parallel
2625
- text

src/WordFreq.hs

+20-11
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
{-# LANGUAGE BangPatterns #-}
22
{-# LANGUAGE OverloadedStrings #-}
3+
{-# LANGUAGE RankNTypes #-}
34
{-# LANGUAGE TupleSections #-}
45
module WordFreq (
56
loadWikiDump, WordFreq, filterWordFreq, foldWordFreq, addWord
67
, commonSequences, normalize, byFrequency
78
, loadWordList, knownWord
89
) where
9-
10+
import Debug.Trace (traceShow)
1011
import Control.Applicative ((<|>))
1112
import Control.Monad (void)
1213
import Control.Monad.Extra (ifM)
1314
import Control.Parallel.Strategies
14-
import Data.Attoparsec.Text
15+
import Data.Attoparsec.Text hiding (take)
1516
import Data.Char (isAlpha)
1617
import Data.Foldable (foldl')
17-
import Data.List (groupBy, sort, sortOn)
18+
import Data.List (groupBy, sort, sortBy, sortOn)
1819
import Data.Map.Strict (Map)
1920
import Data.IntMap.Strict (IntMap)
2021
import qualified Data.Map.Strict as Map
@@ -70,27 +71,35 @@ commonSequences = WordFreq
7071
. deleteInfix . Map.foldrWithKey' subseqs mempty
7172
. unWordFreq where
7273
subseqs :: Text -> Int -> Map Text Int -> Map Text Int
73-
subseqs k a !m = let l = Text.length k
74+
subseqs k a !m = let lc = Text.toLower k
75+
l = Text.length lc
7476
in foldl' (flip $ uncurry $ Map.insertWith (+)) m [
75-
(Text.toLower $ Text.take n $ Text.drop i k, a)
77+
(Text.take n $ Text.drop i lc, a)
7678
| i <- [0 .. l - 1], n <- [1 .. l - i]
7779
]
7880
deleteInfix m = Map.withoutKeys m $ mconcat $
7981
withStrategy (parList rdeepseq) $ map search $
80-
foldMap (breakDownBy Text.length) $
82+
foldMap (breakBy $ Down . Text.length) $
8183
Map.foldrWithKey' invert mempty m where
8284
invert k a = IntMap.insertWith mappend a [k]
8385
search (l, r) = Set.fromList $ concatMap (\x -> filter (`Text.isInfixOf` x) r) l
84-
breakDownBy by = go . sortOn (Down . by) where
86+
breakBy by = go . sortOn fst . map (\a -> (by a, a)) where
87+
chunksOf i ls = map (take i) (splitter ls) where
88+
splitter [] = []
89+
splitter l = l : splitter (drop i l)
8590
go [] = []
86-
go (x:xs) = let b = by x in case break ((b /=) . by) xs of
91+
go ((b, a):xs) = case break ((b /=) . fst) xs of
8792
(_, []) -> []
88-
(xs', ys) -> (x:xs', ys) : go ys
93+
(xs', ys) -> let !ys' = map snd ys
94+
pieces = chunksOf 1000 $ a:map snd xs'
95+
in map (, ys') pieces ++ go ys
8996

9097
byFrequency :: WordFreq Text -> [(Int, Text)]
91-
byFrequency = IntMap.foldlWithKey' g [] . Map.foldrWithKey' f mempty . unWordFreq where
98+
byFrequency = IntMap.foldlWithKey' g []
99+
. withStrategy (parTraversable rdeepseq) . fmap sort
100+
. Map.foldrWithKey' f mempty . unWordFreq where
92101
f k a = IntMap.insertWith (const (k:)) a [k]
93-
g xs a ks = map (a,) (sort ks) ++ xs
102+
g xs a ks = map (a,) ks ++ xs
94103

95104
loadWordList :: FilePath -> IO (Set Text)
96105
loadWordList = fmap (Set.fromList . map Text.toLower . Text.lines)

stack.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
resolver: lts-12.11
1+
resolver: lts-12.13
22
packages:
33
- .

0 commit comments

Comments
 (0)