1 : /** @file
2 : * @author Enrico Zini <enrico@enricozini.org>
3 : * Correlate popcon data with local popcon information
4 : */
5 :
6 : /*
7 : * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
8 : *
9 : * This program is free software; you can redistribute it and/or modify
10 : * it under the terms of the GNU General Public License as published by
11 : * the Free Software Foundation; either version 2 of the License, or
12 : * (at your option) any later version.
13 : *
14 : * This program is distributed in the hope that it will be useful,
15 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : * GNU General Public License for more details.
18 : *
19 : * You should have received a copy of the GNU General Public License
20 : * along with this program; if not, write to the Free Software
21 : * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 : */
23 :
24 : #include <ept/popcon/local.h>
25 : #include <ept/popcon/popcon.h>
26 : #include <ept/popcon/maint/path.h>
27 :
28 : #include <wibble/exception.h>
29 :
30 : #include <algorithm>
31 : #include <fstream>
32 : #include <cmath>
33 :
34 : //#include <iostream>
35 :
36 : using namespace std;
37 :
38 : namespace ept {
39 : namespace popcon {
40 :
/**
 * Break \a str into the fields delimited by \a sep.
 *
 * Two adjacent separators produce an empty field between them, and a leading
 * separator produces an empty first field. A trailing separator does not
 * produce a final empty field, and an empty input produces no fields at all.
 */
static std::vector<std::string> split(const std::string& str, char sep = ' ')
{
    std::vector<std::string> fields;
    std::string::size_type pos = 0;
    while (pos < str.size())
    {
        const std::string::size_type next = str.find(sep, pos);
        if (next == std::string::npos)
        {
            // No more separators: the rest of the string is the last field
            fields.push_back(str.substr(pos));
            return fields;
        }
        fields.push_back(str.substr(pos, next - pos));
        // Resume scanning just past the separator
        pos = next + 1;
    }
    return fields;
}
62 :
/**
 * Ordering predicate for (name, score) pairs that sorts in descending order.
 *
 * Pairs are ranked by score, highest first; pairs with equal scores are
 * ranked by name, also in descending order.
 */
struct secondsort
{
    bool operator()(const std::pair<std::string, float>& a, const std::pair<std::string, float>& b) const
    {
        // Different scores decide the order on their own
        if (a.second != b.second)
            return a.second > b.second;
        // Equal scores: fall back to the name
        return a.first > b.first;
    }
};
74 :
75 1 : Local::Local(const std::string& file)
76 : {
77 1 : m_timestamp = Path::timestamp(file);
78 1 : if (m_timestamp == 0)
79 0 : return;
80 :
81 1 : ifstream in;
82 1 : in.open(file.c_str());
83 1 : if (!in.good())
84 0 : throw wibble::exception::File(file, "opening file for reading");
85 :
86 1907 : while (!in.eof())
87 : {
88 1905 : std::string line;
89 1905 : getline(in, line);
90 1905 : if (line.substr(0, 10) == "POPULARITY")
91 3 : continue;
92 1904 : if (line.substr(0, 14) == "END-POPULARITY")
93 : continue;
94 1903 : vector<string> data = split(line);
95 1903 : if (data.size() < 4)
96 : continue;
97 1902 : if (data[3] == "<NOFILES>")
98 : // This is an empty / virtual package
99 979 : m_scores.insert(make_pair(data[2], 0.1));
100 923 : else if (data.size() == 4)
101 : // Package normally in use
102 0 : m_scores.insert(make_pair(data[2], 1.0));
103 923 : else if (data[4] == "<OLD>")
104 : // Unused packages
105 745 : m_scores.insert(make_pair(data[2], 0.3));
106 178 : else if (data[4] == "<RECENT-CTIME>")
107 : // Recently installed packages
108 178 : m_scores.insert(make_pair(data[2], 0.5));
109 1 : }
110 0 : }
111 :
112 2 : float Local::score(const std::string& pkg) const
113 : {
114 2 : std::map<std::string, float>::const_iterator i = m_scores.find(pkg);
115 2 : if (i == m_scores.end())
116 0 : return 0;
117 : else
118 2 : return i->second;
119 : }
120 :
121 : /**
122 : * Return the TFIDF score of the package computed against the popcon
123 : * information.
124 : */
125 1 : float Local::tfidf(const Popcon& popcon, const std::string& pkg) const
126 : {
127 1 : float popconScore = popcon.score(pkg);
128 : //cerr << pkg << ": " << score(pkg) << " * log(" << (float)popcon.submissions() << " / " << popconScore << ") = " << score(pkg) * log((float)popcon.submissions() / popconScore) << endl;
129 1 : if (popconScore == 0)
130 0 : return 0;
131 : else
132 1 : return score(pkg) * log((float)popcon.submissions() / popconScore);
133 :
134 : }
135 :
136 0 : std::vector< std::pair<std::string, float> > Local::scores() const
137 : {
138 0 : vector< pair<string, float> > res;
139 : // Copy the scores in res
140 0 : copy(m_scores.begin(), m_scores.end(), back_inserter(res));
141 : // Sort res by score
142 0 : sort(res.begin(), res.end(), secondsort());
143 0 : return res;
144 : }
145 :
146 0 : std::vector< std::pair<std::string, float> > Local::tfidf(const Popcon& popcon) const
147 : {
148 0 : vector< pair<string, float> > res;
149 : // Compute the tfidf scores and store them into res
150 0 : for (std::map<std::string, float>::const_iterator i = m_scores.begin();
151 : i != m_scores.end(); ++i)
152 : {
153 0 : float popconScore = popcon.score(i->first);
154 0 : if (popconScore == 0)
155 0 : res.push_back(make_pair(i->first, 0.0f));
156 : else
157 : res.push_back(make_pair(i->first,
158 0 : i->second * log((float)popcon.submissions() / popconScore)));
159 : }
160 : // Sort res by score
161 0 : sort(res.begin(), res.end(), secondsort());
162 0 : return res;
163 : }
164 :
165 : }
166 6 : }
167 :
168 : // vim:set ts=4 sw=4:
|